summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks4
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/acct.c94
-rw-r--r--kernel/audit.h17
-rw-r--r--kernel/auditfilter.c2
-rw-r--r--kernel/auditsc.c176
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/syscall.c25
-rw-r--r--kernel/cgroup.c22
-rw-r--r--kernel/compat.c5
-rw-r--r--kernel/cpu.c61
-rw-r--r--kernel/cpuset.c66
-rw-r--r--kernel/debug/debug_core.c19
-rw-r--r--kernel/debug/kdb/kdb_io.c46
-rw-r--r--kernel/debug/kdb/kdb_main.c18
-rw-r--r--kernel/debug/kdb/kdb_private.h4
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/core.c517
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/fork.c17
-rw-r--r--kernel/futex.c10
-rw-r--r--kernel/gcov/Makefile36
-rw-r--r--kernel/irq/chip.c16
-rw-r--r--kernel/irq/manage.c137
-rw-r--r--kernel/irq/msi.c11
-rw-r--r--kernel/irq/pm.c7
-rw-r--r--kernel/irq/proc.c11
-rw-r--r--kernel/kexec.c25
-rw-r--r--kernel/kprobes.c24
-rw-r--r--kernel/livepatch/Kconfig18
-rw-r--r--kernel/livepatch/Makefile3
-rw-r--r--kernel/livepatch/core.c1003
-rw-r--r--kernel/locking/Makefile11
-rw-r--r--kernel/locking/lockdep.c81
-rw-r--r--kernel/locking/mcs_spinlock.h22
-rw-r--r--kernel/locking/mutex.c105
-rw-r--r--kernel/locking/osq_lock.c (renamed from kernel/locking/mcs_spinlock.c)21
-rw-r--r--kernel/locking/rtmutex.c13
-rw-r--r--kernel/locking/rwsem-spinlock.c9
-rw-r--r--kernel/locking/rwsem-xadd.c101
-rw-r--r--kernel/locking/rwsem.c22
-rw-r--r--kernel/locking/rwsem.h20
-rw-r--r--kernel/locking/spinlock.c8
-rw-r--r--kernel/module.c170
-rw-r--r--kernel/notifier.c3
-rw-r--r--kernel/padata.c11
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/params.c3
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/process.c75
-rw-r--r--kernel/power/qos.c91
-rw-r--r--kernel/power/snapshot.c32
-rw-r--r--kernel/power/suspend.c43
-rw-r--r--kernel/printk/console_cmdline.h2
-rw-r--r--kernel/printk/printk.c15
-rw-r--r--kernel/profile.c3
-rw-r--r--kernel/ptrace.c1
-rw-r--r--kernel/rcu/Makefile3
-rw-r--r--kernel/rcu/rcu.h6
-rw-r--r--kernel/rcu/rcutorture.c66
-rw-r--r--kernel/rcu/srcu.c2
-rw-r--r--kernel/rcu/tiny.c113
-rw-r--r--kernel/rcu/tiny_plugin.h9
-rw-r--r--kernel/rcu/tree.c355
-rw-r--r--kernel/rcu/tree.h62
-rw-r--r--kernel/rcu/tree_plugin.h277
-rw-r--r--kernel/rcu/tree_trace.c8
-rw-r--r--kernel/resource.c25
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/auto_group.c6
-rw-r--r--kernel/sched/clock.c13
-rw-r--r--kernel/sched/completion.c31
-rw-r--r--kernel/sched/core.c388
-rw-r--r--kernel/sched/cpudeadline.c27
-rw-r--r--kernel/sched/cpudeadline.h2
-rw-r--r--kernel/sched/deadline.c158
-rw-r--r--kernel/sched/debug.c13
-rw-r--r--kernel/sched/fair.c446
-rw-r--r--kernel/sched/features.h13
-rw-r--r--kernel/sched/idle.c66
-rw-r--r--kernel/sched/rt.c207
-rw-r--r--kernel/sched/sched.h136
-rw-r--r--kernel/sched/stats.c11
-rw-r--r--kernel/seccomp.c4
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/smpboot.c2
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sys.c19
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/taskstats.c13
-rw-r--r--kernel/time/Kconfig6
-rw-r--r--kernel/time/Makefile8
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clockevents.c229
-rw-r--r--kernel/time/clocksource.c249
-rw-r--r--kernel/time/hrtimer.c125
-rw-r--r--kernel/time/jiffies.c7
-rw-r--r--kernel/time/ntp.c27
-rw-r--r--kernel/time/posix-cpu-timers.c3
-rw-r--r--kernel/time/sched_clock.c236
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c11
-rw-r--r--kernel/time/tick-broadcast.c179
-rw-r--r--kernel/time/tick-common.c120
-rw-r--r--kernel/time/tick-internal.h211
-rw-r--r--kernel/time/tick-oneshot.c6
-rw-r--r--kernel/time/tick-sched.c18
-rw-r--r--kernel/time/tick-sched.h74
-rw-r--r--kernel/time/time.c4
-rw-r--r--kernel/time/timecounter.c112
-rw-r--r--kernel/time/timekeeping.c542
-rw-r--r--kernel/time/timekeeping.h9
-rw-r--r--kernel/time/timer.c149
-rw-r--r--kernel/time/timer_list.c34
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c40
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/ring_buffer.c40
-rw-r--r--kernel/trace/trace.c8
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_seq.c2
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c929
126 files changed, 6048 insertions, 3138 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 76768ee812b2..08561f1acd13 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER
def_bool y
depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+config LOCK_SPIN_ON_OWNER
+ def_bool y
+ depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
+
config ARCH_USE_QUEUE_RWLOCK
bool
diff --git a/kernel/Makefile b/kernel/Makefile
index a59481a3fa6c..1408b3353a3c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = -pg
-CFLAGS_REMOVE_irq_work.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
# cond_syscall is currently not LTO compatible
@@ -26,6 +26,7 @@ obj-y += power/
obj-y += printk/
obj-y += irq/
obj-y += rcu/
+obj-y += livepatch/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -142,7 +143,7 @@ endif
kernel/system_certificates.o: $(obj)/x509_certificate_list
quiet_cmd_x509certs = CERTS $@
- cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)")
+ cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
targets += $(obj)/x509_certificate_list
$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
diff --git a/kernel/acct.c b/kernel/acct.c
index 33738ef972f3..e6c10d1a4058 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct bsd_acct_struct *acct);
struct bsd_acct_struct {
struct fs_pin pin;
+ atomic_long_t count;
+ struct rcu_head rcu;
struct mutex lock;
int active;
unsigned long needcheck;
@@ -89,6 +90,8 @@ struct bsd_acct_struct {
struct completion done;
};
+static void do_acct_process(struct bsd_acct_struct *acct);
+
/*
* Check the amount of free space and suspend/resume accordingly.
*/
@@ -124,32 +127,56 @@ out:
return acct->active;
}
+static void acct_put(struct bsd_acct_struct *p)
+{
+ if (atomic_long_dec_and_test(&p->count))
+ kfree_rcu(p, rcu);
+}
+
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+ return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
{
struct bsd_acct_struct *res;
again:
smp_rmb();
rcu_read_lock();
- res = ACCESS_ONCE(ns->bacct);
+ res = to_acct(ACCESS_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
}
- if (!atomic_long_inc_not_zero(&res->pin.count)) {
+ if (!atomic_long_inc_not_zero(&res->count)) {
rcu_read_unlock();
cpu_relax();
goto again;
}
rcu_read_unlock();
mutex_lock(&res->lock);
- if (!res->ns) {
+ if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
- pin_put(&res->pin);
+ acct_put(res);
goto again;
}
return res;
}
+static void acct_pin_kill(struct fs_pin *pin)
+{
+ struct bsd_acct_struct *acct = to_acct(pin);
+ mutex_lock(&acct->lock);
+ do_acct_process(acct);
+ schedule_work(&acct->work);
+ wait_for_completion(&acct->done);
+ cmpxchg(&acct->ns->bacct, pin, NULL);
+ mutex_unlock(&acct->lock);
+ pin_remove(pin);
+ acct_put(acct);
+}
+
static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
@@ -160,44 +187,13 @@ static void close_work(struct work_struct *work)
complete(&acct->done);
}
-static void acct_kill(struct bsd_acct_struct *acct,
- struct bsd_acct_struct *new)
-{
- if (acct) {
- struct pid_namespace *ns = acct->ns;
- do_acct_process(acct);
- INIT_WORK(&acct->work, close_work);
- init_completion(&acct->done);
- schedule_work(&acct->work);
- wait_for_completion(&acct->done);
- pin_remove(&acct->pin);
- ns->bacct = new;
- acct->ns = NULL;
- atomic_long_dec(&acct->pin.count);
- mutex_unlock(&acct->lock);
- pin_put(&acct->pin);
- }
-}
-
-static void acct_pin_kill(struct fs_pin *pin)
-{
- struct bsd_acct_struct *acct;
- acct = container_of(pin, struct bsd_acct_struct, pin);
- mutex_lock(&acct->lock);
- if (!acct->ns) {
- mutex_unlock(&acct->lock);
- pin_put(pin);
- acct = NULL;
- }
- acct_kill(acct, NULL);
-}
-
static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt, *internal;
struct pid_namespace *ns = task_active_pid_ns(current);
- struct bsd_acct_struct *acct, *old;
+ struct bsd_acct_struct *acct;
+ struct fs_pin *old;
int err;
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
@@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname)
mnt = file->f_path.mnt;
file->f_path.mnt = internal;
- atomic_long_set(&acct->pin.count, 1);
- acct->pin.kill = acct_pin_kill;
+ atomic_long_set(&acct->count, 1);
+ init_fs_pin(&acct->pin, acct_pin_kill);
acct->file = file;
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
+ INIT_WORK(&acct->work, close_work);
+ init_completion(&acct->done);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
pin_insert(&acct->pin, mnt);
- old = acct_get(ns);
- if (old)
- acct_kill(old, acct);
- else
- ns->bacct = acct;
+ rcu_read_lock();
+ old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
+ pin_kill(old);
mnt_drop_write(mnt);
mntput(mnt);
return 0;
@@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
- acct_kill(acct_get(task_active_pid_ns(current)), NULL);
+ rcu_read_lock();
+ pin_kill(task_active_pid_ns(current)->bacct);
}
return error;
@@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
void acct_exit_ns(struct pid_namespace *ns)
{
- acct_kill(acct_get(ns), NULL);
+ rcu_read_lock();
+ pin_kill(ns->bacct);
}
/*
@@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns)
if (acct) {
do_acct_process(acct);
mutex_unlock(&acct->lock);
- pin_put(&acct->pin);
+ acct_put(acct);
}
}
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cdffad5a1d9..1caa0d345d90 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
-/* 0 = no checking
- 1 = put_count checking
- 2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
-
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
* a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
};
};
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
*
* Further, in fs/namei.c:path_lookup() we store the inode and device.
*/
@@ -86,7 +79,6 @@ struct audit_names {
struct filename *name;
int name_len; /* number of chars to log */
bool hidden; /* don't log this record */
- bool name_put; /* call __putname()? */
unsigned long ino;
dev_t dev;
@@ -208,11 +200,6 @@ struct audit_context {
};
int fds[2];
struct audit_proctitle proctitle;
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
};
extern u32 audit_ever_enabled;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f68a326d92e..72e1660a79a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_nofree;
bufp = data->buf;
- entry->rule.vers_ops = 2;
for (i = 0; i < data->field_count; i++) {
struct audit_field *f = &entry->rule.fields[i];
@@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
return ERR_PTR(-ENOMEM);
new = &entry->rule;
- new->vers_ops = old->vers_ops;
new->flags = old->flags;
new->pflags = old->pflags;
new->listnr = old->listnr;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 072566dd0caf..dc4ae70a7413 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
-#if AUDIT_DEBUG == 2
- if (context->put_count + context->ino_count != context->name_count) {
- int i = 0;
-
- pr_err("%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d ino_count=%d"
- " [NOT freeing]\n", __FILE__, __LINE__,
- context->serial, context->major, context->in_syscall,
- context->name_count, context->put_count,
- context->ino_count);
- list_for_each_entry(n, &context->names_list, list) {
- pr_err("names[%d] = %p = %s\n", i++, n->name,
- n->name->name ?: "(null)");
- }
- dump_stack();
- return;
- }
-#endif
-#if AUDIT_DEBUG
- context->put_count = 0;
- context->ino_count = 0;
-#endif
-
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
- if (n->name && n->name_put)
- final_putname(n->name);
+ if (n->name)
+ putname(n->name);
if (n->should_free)
kfree(n);
}
@@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
-#if AUDIT_DEBUG
- context->ino_count++;
-#endif
return aname;
}
@@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr)
+ if (n->name->uptr == uptr) {
+ n->name->refcnt++;
return n->name;
+ }
}
return NULL;
}
@@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name)
struct audit_context *context = current->audit_context;
struct audit_names *n;
- if (!context->in_syscall) {
-#if AUDIT_DEBUG == 2
- pr_err("%s:%d(:%d): ignoring getname(%p)\n",
- __FILE__, __LINE__, context->serial, name);
- dump_stack();
-#endif
+ if (!context->in_syscall)
return;
- }
-
-#if AUDIT_DEBUG
- /* The filename _must_ have a populated ->name */
- BUG_ON(!name->name);
-#endif
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
@@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
- n->name_put = true;
name->aname = n;
+ name->refcnt++;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
}
-/* audit_putname - intercept a putname request
- * @name: name to intercept and delay for putname
- *
- * If we have stored the name from getname in the audit context,
- * then we delay the putname until syscall exit.
- * Called from include/linux/fs.h:putname().
- */
-void audit_putname(struct filename *name)
-{
- struct audit_context *context = current->audit_context;
-
- BUG_ON(!context);
- if (!name->aname || !context->in_syscall) {
-#if AUDIT_DEBUG == 2
- pr_err("%s:%d(:%d): final_putname(%p)\n",
- __FILE__, __LINE__, context->serial, name);
- if (context->name_count) {
- struct audit_names *n;
- int i = 0;
-
- list_for_each_entry(n, &context->names_list, list)
- pr_err("name[%d] = %p = %s\n", i++, n->name,
- n->name->name ?: "(null)");
- }
-#endif
- final_putname(name);
- }
-#if AUDIT_DEBUG
- else {
- ++context->put_count;
- if (context->put_count > context->name_count) {
- pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
- " name_count=%d put_count=%d\n",
- __FILE__, __LINE__,
- context->serial, context->major,
- context->in_syscall, name->name,
- context->name_count, context->put_count);
- dump_stack();
- }
- }
-#endif
-}
-
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
if (!name)
goto out_alloc;
-#if AUDIT_DEBUG
- /* The struct filename _must_ have a populated ->name */
- BUG_ON(!name->name);
-#endif
/*
* If we have a pointer to an audit_names entry already, then we can
* just use it directly if the type is correct.
@@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
}
list_for_each_entry_reverse(n, &context->names_list, list) {
- if (!n->name || strcmp(n->name->name, name->name))
+ if (n->ino) {
+ /* valid inode number, use that for the comparison */
+ if (n->ino != inode->i_ino ||
+ n->dev != inode->i_sb->s_dev)
+ continue;
+ } else if (n->name) {
+ /* inode number has not been set, check the name */
+ if (strcmp(n->name->name, name->name))
+ continue;
+ } else
+ /* no inode and no name (?!) ... this is odd ... */
continue;
/* match the correct record type */
@@ -1882,44 +1810,11 @@ out_alloc:
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
- /* unfortunately, while we may have a path name to record with the
- * inode, we can't always rely on the string lasting until the end of
- * the syscall so we need to create our own copy, it may fail due to
- * memory allocation issues, but we do our best */
if (name) {
- /* we can't use getname_kernel() due to size limits */
- size_t len = strlen(name->name) + 1;
- struct filename *new = __getname();
-
- if (unlikely(!new))
- goto out;
-
- if (len <= (PATH_MAX - sizeof(*new))) {
- new->name = (char *)(new) + sizeof(*new);
- new->separate = false;
- } else if (len <= PATH_MAX) {
- /* this looks odd, but is due to final_putname() */
- struct filename *new2;
-
- new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
- if (unlikely(!new2)) {
- __putname(new);
- goto out;
- }
- new2->name = (char *)new;
- new2->separate = true;
- new = new2;
- } else {
- /* we should never get here, but let's be safe */
- __putname(new);
- goto out;
- }
- strlcpy((char *)new->name, name->name, len);
- new->uptr = NULL;
- new->aname = n;
- n->name = new;
- n->name_put = true;
+ n->name = name;
+ name->refcnt++;
}
+
out:
if (parent) {
n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1970,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent,
/* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name || n->type != AUDIT_TYPE_PARENT)
+ if (!n->name ||
+ (n->type != AUDIT_TYPE_PARENT &&
+ n->type != AUDIT_TYPE_UNKNOWN))
continue;
- if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+ !audit_compare_dname_path(dname,
+ n->name->name, n->name_len)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = AUDIT_TYPE_PARENT;
found_parent = n;
break;
}
@@ -1983,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent,
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
- if (!n->name || n->type != type)
- continue;
-
- /* if we found a parent, make sure this one is a child of it */
- if (found_parent && (n->name != found_parent->name))
+ if (!n->name ||
+ (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
continue;
if (!strcmp(dname, n->name->name) ||
@@ -1995,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent,
found_parent ?
found_parent->name_len :
AUDIT_NAME_FULL)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = type;
found_child = n;
break;
}
@@ -2019,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- /* don't call __putname() */
- found_child->name_put = false;
+ found_child->name->refcnt++;
}
}
+
if (inode)
audit_copy_inode(found_child, dentry, inode);
else
@@ -2405,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
struct audit_aux_data_bprm_fcaps *ax;
struct audit_context *context = current->audit_context;
struct cpu_vfs_cap_data vcaps;
- struct dentry *dentry;
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
if (!ax)
@@ -2415,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- dentry = dget(bprm->file->f_path.dentry);
- get_vfs_caps_from_disk(dentry, &vcaps);
- dput(dentry);
+ get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d6594e457a25..a64e7a207d2b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- module_free(NULL, hdr);
+ module_memfree(hdr);
}
#endif /* CONFIG_BPF_JIT */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 088ac0b1b106..536edc2be307 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr)
int ufd = attr->map_fd;
struct fd f = fdget(ufd);
struct bpf_map *map;
- void *key, *value;
+ void *key, *value, *ptr;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
@@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
- err = -ENOENT;
- rcu_read_lock();
- value = map->ops->map_lookup_elem(map, key);
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
if (!value)
- goto err_unlock;
+ goto free_key;
+
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (ptr)
+ memcpy(value, ptr, map->value_size);
+ rcu_read_unlock();
+
+ err = -ENOENT;
+ if (!ptr)
+ goto free_value;
err = -EFAULT;
if (copy_to_user(uvalue, value, map->value_size) != 0)
- goto err_unlock;
+ goto free_value;
err = 0;
-err_unlock:
- rcu_read_unlock();
+free_value:
+ kfree(value);
free_key:
kfree(key);
err_put:
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bb263d0caab3..a220fdb66568 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb)
*
* And don't kill the default root.
*/
- if (css_has_online_children(&root->cgrp.self) ||
+ if (!list_empty(&root->cgrp.self.children) ||
root == &cgrp_dfl_root)
cgroup_put(&root->cgrp);
else
@@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft), 0, cft->kf_ops, cft,
- NULL, false, key);
+ NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
@@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count)
static void pidlist_free(void *p)
{
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- kfree(p);
+ kvfree(p);
}
/*
@@ -4373,16 +4370,20 @@ static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
percpu_ref_exit(&css->refcnt);
- if (css->ss) {
+ if (ss) {
/* css free path */
+ int id = css->id;
+
if (css->parent)
css_put(css->parent);
- css->ss->css_free(css);
+ ss->css_free(css);
+ cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
} else {
/* cgroup free path */
@@ -4434,7 +4435,7 @@ static void css_release_work_fn(struct work_struct *work)
if (ss) {
/* css release path */
- cgroup_idr_remove(&ss->css_idr, css->id);
+ cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
} else {
@@ -5036,6 +5037,9 @@ int __init cgroup_init(void)
WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
}
+
+ if (ss->bind)
+ ss->bind(init_css_set.subsys[ssid]);
}
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
diff --git a/kernel/compat.c b/kernel/compat.c
index ebb3c369d03d..24f00610c575 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
* core implementation decides to return random nonsense.
*/
if (ret == -ERESTART_RESTARTBLOCK) {
- struct restart_block *restart
- = &current_thread_info()->restart_block;
+ struct restart_block *restart = &current->restart_block;
restart->fn = compat_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
@@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
return -EFAULT;
if (err == -ERESTART_RESTARTBLOCK) {
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = compat_clock_nanosleep_restart;
restart->nanosleep.compat_rmtp = rmtp;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5d220234b3ca..82eea9c5af61 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
+#include <linux/tick.h>
#include <trace/events/power.h>
#include "smpboot.h"
@@ -58,22 +59,23 @@ static int cpu_hotplug_disabled;
static struct {
struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
+ /* wait queue to wake up the active_writer */
+ wait_queue_head_t wq;
+ /* verifies that no writer will get active while readers are active */
+ struct mutex lock;
/*
* Also blocks the new readers during
* an ongoing cpu hotplug operation.
*/
- int refcount;
- /* And allows lockless put_online_cpus(). */
- atomic_t puts_pending;
+ atomic_t refcount;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
} cpu_hotplug = {
.active_writer = NULL,
+ .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
- .refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
.dep_map = {.name = "cpu_hotplug.lock" },
#endif
@@ -86,15 +88,6 @@ static struct {
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
-static void apply_puts_pending(int max)
-{
- int delta;
-
- if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
- delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
- cpu_hotplug.refcount -= delta;
- }
-}
void get_online_cpus(void)
{
@@ -103,8 +96,7 @@ void get_online_cpus(void)
return;
cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
- apply_puts_pending(65536);
- cpu_hotplug.refcount++;
+ atomic_inc(&cpu_hotplug.refcount);
mutex_unlock(&cpu_hotplug.lock);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -116,8 +108,7 @@ bool try_get_online_cpus(void)
if (!mutex_trylock(&cpu_hotplug.lock))
return false;
cpuhp_lock_acquire_tryread();
- apply_puts_pending(65536);
- cpu_hotplug.refcount++;
+ atomic_inc(&cpu_hotplug.refcount);
mutex_unlock(&cpu_hotplug.lock);
return true;
}
@@ -125,20 +116,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus);
void put_online_cpus(void)
{
+ int refcount;
+
if (cpu_hotplug.active_writer == current)
return;
- if (!mutex_trylock(&cpu_hotplug.lock)) {
- atomic_inc(&cpu_hotplug.puts_pending);
- cpuhp_lock_release();
- return;
- }
- if (WARN_ON(!cpu_hotplug.refcount))
- cpu_hotplug.refcount++; /* try to fix things up */
+ refcount = atomic_dec_return(&cpu_hotplug.refcount);
+ if (WARN_ON(refcount < 0)) /* try to fix things up */
+ atomic_inc(&cpu_hotplug.refcount);
+
+ if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+ wake_up(&cpu_hotplug.wq);
- if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
- wake_up_process(cpu_hotplug.active_writer);
- mutex_unlock(&cpu_hotplug.lock);
cpuhp_lock_release();
}
@@ -168,18 +157,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
*/
void cpu_hotplug_begin(void)
{
- cpu_hotplug.active_writer = current;
+ DEFINE_WAIT(wait);
+ cpu_hotplug.active_writer = current;
cpuhp_lock_acquire();
+
for (;;) {
mutex_lock(&cpu_hotplug.lock);
- apply_puts_pending(1);
- if (likely(!cpu_hotplug.refcount))
- break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (likely(!atomic_read(&cpu_hotplug.refcount)))
+ break;
mutex_unlock(&cpu_hotplug.lock);
schedule();
}
+ finish_wait(&cpu_hotplug.wq, &wait);
}
void cpu_hotplug_done(void)
@@ -348,6 +339,8 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Give up timekeeping duties */
+ tick_handover_do_timer();
/* Park the stopper thread */
kthread_park(current);
return 0;
@@ -421,10 +414,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
while (!idle_cpu(cpu))
cpu_relax();
+ hotplug_cpu__broadcast_tick_pull(cpu);
/* This actually kills the CPU. */
__cpu_die(cpu);
/* CPU is completely dead: tell everyone. Too late to complain. */
+ tick_cleanup_dead_cpu(cpu);
cpu_notify_nofail(CPU_DEAD | mod, hcpu);
check_for_tasks(cpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b257f6bca2..c68f0721df10 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -548,9 +548,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs)
- continue;
-
/* skip the whole subtree if @cp doesn't have any CPU */
if (cpumask_empty(cp->cpus_allowed)) {
pos_css = css_rightmost_descendant(pos_css);
@@ -625,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -634,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
+ if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+ goto done;
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -646,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.effective_cpus);
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ non_isolated_cpus);
goto done;
}
@@ -669,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
* the corresponding sched domain.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- !is_sched_load_balance(cp))
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
continue;
if (is_sched_load_balance(cp))
@@ -751,6 +755,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
+ cpumask_and(dp, dp, non_isolated_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -763,6 +768,7 @@ restart:
BUG_ON(nslot != ndoms);
done:
+ free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
@@ -873,7 +879,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cpumask_empty(new_cpus))
+ if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -1129,7 +1135,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (nodes_empty(*new_mems))
+ if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1707,40 +1713,27 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
struct cpuset *cs = css_cs(seq_css(sf));
cpuset_filetype_t type = seq_cft(sf)->private;
- ssize_t count;
- char *buf, *s;
int ret = 0;
- count = seq_get_buf(sf, &buf);
- s = buf;
-
spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
- s += cpulist_scnprintf(s, count, cs->cpus_allowed);
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
break;
case FILE_MEMLIST:
- s += nodelist_scnprintf(s, count, cs->mems_allowed);
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
break;
case FILE_EFFECTIVE_CPULIST:
- s += cpulist_scnprintf(s, count, cs->effective_cpus);
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
break;
case FILE_EFFECTIVE_MEMLIST:
- s += nodelist_scnprintf(s, count, cs->effective_mems);
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
default:
ret = -EINVAL;
- goto out_unlock;
}
- if (s < buf + count - 1) {
- *s++ = '\n';
- seq_commit(sf, s - buf);
- } else {
- seq_commit(sf, -1);
- }
-out_unlock:
spin_unlock_irq(&callback_lock);
return ret;
}
@@ -1992,7 +1985,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
@@ -2400,7 +2395,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
*/
}
-void cpuset_init_current_mems_allowed(void)
+void __init cpuset_init_current_mems_allowed(void)
{
nodes_setall(current->mems_allowed);
}
@@ -2610,8 +2605,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
-#define CPUSET_NODELIST_LEN (256)
-
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
* @tsk: pointer to task_struct of some task.
@@ -2621,23 +2614,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
- /* Statically allocated to prevent using excess stack. */
- static char cpuset_nodelist[CPUSET_NODELIST_LEN];
- static DEFINE_SPINLOCK(cpuset_buffer_lock);
struct cgroup *cgrp;
- spin_lock(&cpuset_buffer_lock);
rcu_read_lock();
cgrp = task_cs(tsk)->css.cgroup;
- nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
- tsk->mems_allowed);
pr_info("%s cpuset=", tsk->comm);
pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+ pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
rcu_read_unlock();
- spin_unlock(&cpuset_buffer_lock);
}
/*
@@ -2715,10 +2701,8 @@ out:
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_puts(m, "Mems_allowed:\t");
- seq_nodemask(m, &task->mems_allowed);
- seq_puts(m, "\n");
- seq_puts(m, "Mems_allowed_list:\t");
- seq_nodemask_list(m, &task->mems_allowed);
- seq_puts(m, "\n");
+ seq_printf(m, "Mems_allowed:\t%*pb\n",
+ nodemask_pr_args(&task->mems_allowed));
+ seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
+ nodemask_pr_args(&task->mems_allowed));
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 07ce18ca71e0..0874e2edd275 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -604,7 +604,7 @@ return_normal:
online_cpus)
cpu_relax();
if (!time_left)
- pr_crit("KGDB: Timed out waiting for secondary CPUs.\n");
+ pr_crit("Timed out waiting for secondary CPUs.\n");
/*
* At this point the primary processor is completely
@@ -696,6 +696,14 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
if (arch_kgdb_ops.enable_nmi)
arch_kgdb_ops.enable_nmi(0);
+ /*
+ * Avoid entering the debugger if we were triggered due to an oops
+ * but panic_timeout indicates the system should automatically
+ * reboot on panic. We don't want to get stuck waiting for input
+ * on such systems, especially if it's "just" an oops.
+ */
+ if (signo != SIGTRAP && panic_timeout)
+ return 1;
memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = raw_smp_processor_id();
@@ -828,6 +836,15 @@ static int kgdb_panic_event(struct notifier_block *self,
unsigned long val,
void *data)
{
+ /*
+ * Avoid entering the debugger if we were triggered due to a panic.
+ * We don't want to get stuck waiting for input from the user in such a case.
+ * panic_timeout indicates the system should automatically
+ * reboot on panic.
+ */
+ if (panic_timeout)
+ return NOTIFY_DONE;
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", (char *)data);
kgdb_breakpoint();
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 7c70812caea5..fc1ef736253c 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -439,7 +439,7 @@ poll_again:
* substituted for %d, %x or %o in the prompt.
*/
-char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
+char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
@@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor)
return 0;
}
-int vkdb_printf(const char *fmt, va_list ap)
+int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
{
int diag;
int linecount;
@@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap)
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
}
+ if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+ /*
+ * This was an interactive search (using '/' at the more
+ * prompt) and it has completed. Clear the flag.
+ */
+ kdb_grepping_flag = 0;
/*
* at this point the string is a full line and
* should be printed, up to the null.
@@ -691,19 +697,20 @@ kdb_printit:
* Write to all consoles.
*/
retlen = strlen(kdb_buffer);
+ cp = (char *) printk_skip_level(kdb_buffer);
if (!dbg_kdb_mode && kgdb_connected) {
- gdbstub_msg_write(kdb_buffer, retlen);
+ gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
} else {
if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = retlen;
- cp = kdb_buffer;
+ len = retlen - (cp - kdb_buffer);
+ cp2 = cp;
while (len--) {
- dbg_io_ops->write_char(*cp);
- cp++;
+ dbg_io_ops->write_char(*cp2);
+ cp2++;
}
}
while (c) {
- c->write(c, kdb_buffer, retlen);
+ c->write(c, cp, retlen - (cp - kdb_buffer));
touch_nmi_watchdog();
c = c->next;
}
@@ -711,7 +718,10 @@ kdb_printit:
if (logging) {
saved_loglevel = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_SILENT;
- printk(KERN_INFO "%s", kdb_buffer);
+ if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK)
+ printk("%s", kdb_buffer);
+ else
+ pr_info("%s", kdb_buffer);
}
if (KDB_STATE(PAGER)) {
@@ -794,11 +804,23 @@ kdb_printit:
kdb_nextline = linecount - 1;
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] == '/' && !kdb_grepping_flag) {
+ kdb_printf("\r");
+ kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
+ kdbgetenv("SEARCHPROMPT") ?: "search> ");
+ *strchrnul(kdb_grep_string, '\n') = '\0';
+ kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
+ suspend_grep = 1; /* for this recursion */
} else if (buf1[0] && buf1[0] != '\n') {
/* user hit something other than enter */
suspend_grep = 1; /* for this recursion */
- kdb_printf("\nOnly 'q' or 'Q' are processed at more "
- "prompt, input ignored\n");
+ if (buf1[0] != '/')
+ kdb_printf(
+ "\nOnly 'q', 'Q' or '/' are processed at "
+ "more prompt, input ignored\n");
+ else
+ kdb_printf("\n'/' cannot be used during | "
+ "grep filtering, input ignored\n");
} else if (kdb_grepping_flag) {
/* user hit enter */
suspend_grep = 1; /* for this recursion */
@@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...)
int r;
va_start(ap, fmt);
- r = vkdb_printf(fmt, ap);
+ r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
va_end(ap);
return r;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index f191bddf64b8..4121345498e0 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -50,8 +50,7 @@
static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
-#define GREP_LEN 256
-char kdb_grep_string[GREP_LEN];
+char kdb_grep_string[KDB_GREP_STRLEN];
int kdb_grepping_flag;
EXPORT_SYMBOL(kdb_grepping_flag);
int kdb_grep_leading;
@@ -870,7 +869,7 @@ static void parse_grep(const char *str)
len = strlen(cp);
if (!len)
return;
- if (len >= GREP_LEN) {
+ if (len >= KDB_GREP_STRLEN) {
kdb_printf("search string too long\n");
return;
}
@@ -915,13 +914,12 @@ int kdb_parse(const char *cmdstr)
char *cp;
char *cpp, quoted;
kdbtab_t *tp;
- int i, escaped, ignore_errors = 0, check_grep;
+ int i, escaped, ignore_errors = 0, check_grep = 0;
/*
* First tokenize the command string.
*/
cp = (char *)cmdstr;
- kdb_grepping_flag = check_grep = 0;
if (KDB_FLAG(CMD_INTERRUPT)) {
/* Previous command was interrupted, newline must not
@@ -1247,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_printf("due to NonMaskable Interrupt @ "
kdb_machreg_fmt "\n",
instruction_pointer(regs));
- kdb_dumpregs(regs);
break;
case KDB_REASON_SSTEP:
case KDB_REASON_BREAK:
@@ -1281,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*/
kdb_nextline = 1;
KDB_STATE_CLEAR(SUPPRESS);
+ kdb_grepping_flag = 0;
+ /* ensure the old search does not leak into '/' commands */
+ kdb_grep_string[0] = '\0';
cmdbuf = cmd_cur;
*cmdbuf = '\0';
@@ -2023,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4ld ", module_refcount(mod));
+ kdb_printf("%4d ", module_refcount(mod));
#endif
if (mod->state == MODULE_STATE_GOING)
kdb_printf(" (Unloading)");
@@ -2256,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv)
/*
* Validate cpunum
*/
- if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
+ if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
return KDB_BADCPUNUM;
dbg_switch_cpu = cpunum;
@@ -2583,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv)
#define K(x) ((x) << (PAGE_SHIFT - 10))
kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
"Buffers: %8lu kB\n",
- val.totalram, val.freeram, val.bufferram);
+ K(val.totalram), K(val.freeram), K(val.bufferram));
return 0;
}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index eaacd1693954..75014d7f4568 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -196,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
/* Miscellaneous functions and data areas */
extern int kdb_grepping_flag;
+#define KDB_GREPPING_FLAG_SEARCH 0x8000
extern char kdb_grep_string[];
+#define KDB_GREP_STRLEN 256
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
@@ -209,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
extern void kdb_meminfo_proc_show(void);
-extern char *kdb_getstr(char *, size_t, char *);
+extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
/* Defines for kdb_symbol_print */
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2f..2925188f50ea 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,5 +1,5 @@
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = -pg
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif
obj-y := core.o ring_buffer.o callchain.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 882f835a0d85..2fabc0627165 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, rotation_list);
+static DEFINE_PER_CPU(struct list_head, active_ctx_list);
/*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
+ * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
+ * perf_event_task_tick() are fully serialized because they're strictly cpu
+ * affine and perf_event_ctx_{activate,deactivate} are called with IRQs
+ * disabled, while perf_event_task_tick is called from IRQ context.
*/
-static void perf_pmu_rotate_start(struct pmu *pmu)
+static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- struct list_head *head = this_cpu_ptr(&rotation_list);
+ struct list_head *head = this_cpu_ptr(&active_ctx_list);
WARN_ON(!irqs_disabled());
- if (list_empty(&cpuctx->rotation_list))
- list_add(&cpuctx->rotation_list, head);
+ WARN_ON(!list_empty(&ctx->active_ctx_list));
+
+ list_add(&ctx->active_ctx_list, head);
+}
+
+static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+{
+ WARN_ON(!irqs_disabled());
+
+ WARN_ON(list_empty(&ctx->active_ctx_list));
+
+ list_del_init(&ctx->active_ctx_list);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx)
}
/*
+ * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
+ * perf_pmu_migrate_context() we need some magic.
+ *
+ * Those places that change perf_event::ctx will hold both
+ * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
+ *
+ * Lock ordering is by mutex address. There is one other site where
+ * perf_event_context::mutex nests and that is put_event(). But remember that
+ * that is a parent<->child context relation, and migration does not affect
+ * children, therefore these two orderings should not interact.
+ *
+ * The change in perf_event::ctx does not affect children (as claimed above)
+ * because the sys_perf_event_open() case will install a new event and break
+ * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
+ * concerned with cpuctx and that doesn't have children.
+ *
+ * The places that change perf_event::ctx will issue:
+ *
+ * perf_remove_from_context();
+ * synchronize_rcu();
+ * perf_install_in_context();
+ *
+ * to effect the change. The remove_from_context() + synchronize_rcu() should
+ * quiesce the event, after which we can install it in the new location. This
+ * means that only external vectors (perf_fops, prctl) can perturb the event
+ * while in transit. Therefore all such accessors should also acquire
+ * perf_event_context::mutex to serialize against this.
+ *
+ * However; because event->ctx can change while we're waiting to acquire
+ * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
+ * function.
+ *
+ * Lock order:
+ * task_struct::perf_event_mutex
+ * perf_event_context::mutex
+ * perf_event_context::lock
+ * perf_event::child_mutex;
+ * perf_event::mmap_mutex
+ * mmap_sem
+ */
+static struct perf_event_context *
+perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
+{
+ struct perf_event_context *ctx;
+
+again:
+ rcu_read_lock();
+ ctx = ACCESS_ONCE(event->ctx);
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ mutex_lock_nested(&ctx->mutex, nesting);
+ if (event->ctx != ctx) {
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+
+ return ctx;
+}
+
+static inline struct perf_event_context *
+perf_event_ctx_lock(struct perf_event *event)
+{
+ return perf_event_ctx_lock_nested(event, 0);
+}
+
+static void perf_event_ctx_unlock(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+}
+
+/*
* This must be done under the ctx->lock, such as to serialize against
* context_equiv(), therefore we cannot call put_ctx() since that might end up
* calling scheduler related locks and ctx->lock nests inside those.
@@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_branch_stack++;
list_add_rcu(&event->event_entry, &ctx->event_list);
- if (!ctx->nr_events)
- perf_pmu_rotate_start(ctx->pmu);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event)
if (group_leader == event)
return;
+ WARN_ON_ONCE(group_leader->ctx != event->ctx);
+
if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
!is_software_event(event))
group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
@@ -1296,6 +1384,10 @@ static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
+
+ WARN_ON_ONCE(event->ctx != ctx);
+ lockdep_assert_held(&ctx->lock);
+
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event)
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
+
+ WARN_ON_ONCE(sibling->ctx != event->ctx);
}
out:
@@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event,
{
u64 tstamp = perf_event_time(event);
u64 delta;
+
+ WARN_ON_ONCE(event->ctx != ctx);
+ lockdep_assert_held(&ctx->lock);
+
/*
* An event which could not be activated because of
* filter mismatch still needs to have its timings
@@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event,
if (!is_software_event(event))
cpuctx->active_oncpu--;
- ctx->nr_active--;
+ if (!--ctx->nr_active)
+ perf_event_ctx_deactivate(ctx);
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
@@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info)
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
-void perf_event_disable(struct perf_event *event)
+static void _perf_event_disable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
@@ -1695,6 +1794,19 @@ retry:
}
raw_spin_unlock_irq(&ctx->lock);
}
+
+/*
+ * Strictly speaking kernel users cannot create groups and therefore this
+ * interface does not need the perf_event_ctx_lock() magic.
+ */
+void perf_event_disable(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ _perf_event_disable(event);
+ perf_event_ctx_unlock(event, ctx);
+}
EXPORT_SYMBOL_GPL(perf_event_disable);
static void perf_set_shadow_time(struct perf_event *event,
@@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event,
if (!is_software_event(event))
cpuctx->active_oncpu++;
- ctx->nr_active++;
+ if (!ctx->nr_active++)
+ perf_event_ctx_activate(ctx);
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
@@ -2158,7 +2271,7 @@ unlock:
* perf_event_for_each_child or perf_event_for_each as described
* for perf_event_disable.
*/
-void perf_event_enable(struct perf_event *event)
+static void _perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
@@ -2214,9 +2327,21 @@ retry:
out:
raw_spin_unlock_irq(&ctx->lock);
}
+
+/*
+ * See perf_event_disable();
+ */
+void perf_event_enable(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ _perf_event_enable(event);
+ perf_event_ctx_unlock(event, ctx);
+}
EXPORT_SYMBOL_GPL(perf_event_enable);
-int perf_event_refresh(struct perf_event *event, int refresh)
+static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
* not supported on inherited events
@@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh)
return -EINVAL;
atomic_add(refresh, &event->event_limit);
- perf_event_enable(event);
+ _perf_event_enable(event);
return 0;
}
+
+/*
+ * See perf_event_disable()
+ */
+int perf_event_refresh(struct perf_event *event, int refresh)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_refresh(event, refresh);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(perf_event_refresh);
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
-
- /*
- * Since these rotations are per-cpu, we need to ensure the
- * cpu-context we got scheduled on is actually rotating.
- */
- perf_pmu_rotate_start(ctx->pmu);
}
/*
@@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx)
list_rotate_left(&ctx->flexible_groups);
}
-/*
- * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
- * because they're strictly cpu affine and rotate_start is called with IRQs
- * disabled, while rotate_context is called from IRQ context.
- */
static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
- int rotate = 0, remove = 1;
+ int rotate = 0;
if (cpuctx->ctx.nr_events) {
- remove = 0;
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
rotate = 1;
}
ctx = cpuctx->task_ctx;
if (ctx && ctx->nr_events) {
- remove = 0;
if (ctx->nr_events != ctx->nr_active)
rotate = 1;
}
@@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
done:
- if (remove)
- list_del_init(&cpuctx->rotation_list);
return rotate;
}
@@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&rotation_list);
- struct perf_cpu_context *cpuctx, *tmp;
- struct perf_event_context *ctx;
+ struct list_head *head = this_cpu_ptr(&active_ctx_list);
+ struct perf_event_context *ctx, *tmp;
int throttled;
WARN_ON(!irqs_disabled());
@@ -2976,14 +3100,8 @@ void perf_event_task_tick(void)
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
- list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
- ctx = &cpuctx->ctx;
+ list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
perf_adjust_freq_unthr_context(ctx, throttled);
-
- ctx = cpuctx->task_ctx;
- if (ctx)
- perf_adjust_freq_unthr_context(ctx, throttled);
- }
}
static int event_enable_on_exec(struct perf_event *event,
@@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
+ INIT_LIST_HEAD(&ctx->active_ctx_list);
INIT_LIST_HEAD(&ctx->pinned_groups);
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
@@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event)
rcu_read_unlock();
if (owner) {
- mutex_lock(&owner->perf_event_mutex);
+ /*
+ * If we're here through perf_event_exit_task() we're already
+ * holding ctx->mutex which would be an inversion wrt. the
+ * normal lock order.
+ *
+ * However we can safely take this lock because it's the child
+ * ctx->mutex.
+ */
+ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
+
/*
* We have to re-check the event->owner field, if it is cleared
* we raced with perf_event_exit_task(), acquiring the mutex
@@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event)
*/
static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
+ struct perf_event_context *ctx;
if (!atomic_long_dec_and_test(&event->refcount))
return;
@@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event)
if (!is_kernel_event(event))
perf_remove_from_owner(event);
- WARN_ON_ONCE(ctx->parent_ctx);
/*
* There are two ways this annotation is useful:
*
@@ -3461,9 +3588,10 @@ static void put_event(struct perf_event *event)
* the last filedesc died, so there is no possibility
* to trigger the AB-BA case.
*/
- mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
+ WARN_ON_ONCE(ctx->parent_ctx);
perf_remove_from_context(event, true);
- mutex_unlock(&ctx->mutex);
+ perf_event_ctx_unlock(event, ctx);
_free_event(event);
}
@@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event,
u64 read_format, char __user *buf)
{
struct perf_event *leader = event->group_leader, *sub;
- int n = 0, size = 0, ret = -EFAULT;
struct perf_event_context *ctx = leader->ctx;
- u64 values[5];
+ int n = 0, size = 0, ret;
u64 count, enabled, running;
+ u64 values[5];
+
+ lockdep_assert_held(&ctx->mutex);
- mutex_lock(&ctx->mutex);
count = perf_event_read_value(leader, &enabled, &running);
values[n++] = 1 + leader->nr_siblings;
@@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event,
size = n * sizeof(u64);
if (copy_to_user(buf, values, size))
- goto unlock;
+ return -EFAULT;
ret = size;
@@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event,
size = n * sizeof(u64);
if (copy_to_user(buf + ret, values, size)) {
- ret = -EFAULT;
- goto unlock;
+ return -EFAULT;
}
ret += size;
}
-unlock:
- mutex_unlock(&ctx->mutex);
return ret;
}
@@ -3660,8 +3786,14 @@ static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = perf_read_hw(event, buf, count);
+ perf_event_ctx_unlock(event, ctx);
- return perf_read_hw(event, buf, count);
+ return ret;
}
static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
return events;
}
-static void perf_event_reset(struct perf_event *event)
+static void _perf_event_reset(struct perf_event *event)
{
(void)perf_event_read(event);
local64_set(&event->count, 0);
@@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event,
struct perf_event *child;
WARN_ON_ONCE(event->ctx->parent_ctx);
+
mutex_lock(&event->child_mutex);
func(event);
list_for_each_entry(child, &event->child_list, child_list)
@@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event,
struct perf_event_context *ctx = event->ctx;
struct perf_event *sibling;
- WARN_ON_ONCE(ctx->parent_ctx);
- mutex_lock(&ctx->mutex);
+ lockdep_assert_held(&ctx->mutex);
+
event = event->group_leader;
perf_event_for_each_child(event, func);
list_for_each_entry(sibling, &event->sibling_list, group_entry)
perf_event_for_each_child(sibling, func);
- mutex_unlock(&ctx->mutex);
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
@@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
- struct perf_event *event = file->private_data;
void (*func)(struct perf_event *);
u32 flags = arg;
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
- func = perf_event_enable;
+ func = _perf_event_enable;
break;
case PERF_EVENT_IOC_DISABLE:
- func = perf_event_disable;
+ func = _perf_event_disable;
break;
case PERF_EVENT_IOC_RESET:
- func = perf_event_reset;
+ func = _perf_event_reset;
break;
case PERF_EVENT_IOC_REFRESH:
- return perf_event_refresh(event, arg);
+ return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
return perf_event_period(event, (u64 __user *)arg);
@@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return 0;
}
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx;
+ long ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_ioctl(event, cmd, arg);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
@@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
int perf_event_task_enable(void)
{
+ struct perf_event_context *ctx;
struct perf_event *event;
mutex_lock(&current->perf_event_mutex);
- list_for_each_entry(event, &current->perf_event_list, owner_entry)
- perf_event_for_each_child(event, perf_event_enable);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+ ctx = perf_event_ctx_lock(event);
+ perf_event_for_each_child(event, _perf_event_enable);
+ perf_event_ctx_unlock(event, ctx);
+ }
mutex_unlock(&current->perf_event_mutex);
return 0;
@@ -3895,11 +4043,15 @@ int perf_event_task_enable(void)
int perf_event_task_disable(void)
{
+ struct perf_event_context *ctx;
struct perf_event *event;
mutex_lock(&current->perf_event_mutex);
- list_for_each_entry(event, &current->perf_event_list, owner_entry)
- perf_event_for_each_child(event, perf_event_disable);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+ ctx = perf_event_ctx_lock(event);
+ perf_event_for_each_child(event, _perf_event_disable);
+ perf_event_ctx_unlock(event, ctx);
+ }
mutex_unlock(&current->perf_event_mutex);
return 0;
@@ -3949,7 +4101,8 @@ unlock:
rcu_read_unlock();
}
-void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(
+ struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}
@@ -3999,7 +4152,7 @@ void perf_event_update_userpage(struct perf_event *event)
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
- arch_perf_update_userpage(userpg, now);
+ arch_perf_update_userpage(event, userpg, now);
barrier();
++userpg->lock;
@@ -4141,6 +4294,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
}
/*
@@ -4160,6 +4316,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ if (event->pmu->event_unmapped)
+ event->pmu->event_unmapped(event);
+
atomic_dec(&rb->mmap_count);
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4361,6 +4520,9 @@ unlock:
vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
+
return ret;
}
@@ -4412,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry)
{
struct perf_event *event = container_of(entry,
struct perf_event, pending);
+ int rctx;
+
+ rctx = perf_swevent_get_recursion_context();
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
if (event->pending_disable) {
event->pending_disable = 0;
@@ -4422,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry)
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
}
/*
@@ -5889,6 +6061,8 @@ end:
rcu_read_unlock();
}
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
int perf_swevent_get_recursion_context(void)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -5904,21 +6078,30 @@ inline void perf_swevent_put_recursion_context(int rctx)
put_recursion_context(swhash->recursion, rctx);
}
-void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
struct perf_sample_data data;
- int rctx;
- preempt_disable_notrace();
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
+ if (WARN_ON_ONCE(!regs))
return;
perf_sample_data_init(&data, addr, 0);
-
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ int rctx;
+
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+ if (unlikely(rctx < 0))
+ goto fail;
+
+ ___perf_sw_event(event_id, nr, regs, addr);
perf_swevent_put_recursion_context(rctx);
+fail:
preempt_enable_notrace();
}
@@ -6776,12 +6959,10 @@ skip_type:
__perf_event_init_context(&cpuctx->ctx);
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
__perf_cpu_hrtimer_init(cpuctx, cpu);
- INIT_LIST_HEAD(&cpuctx->rotation_list);
cpuctx->unique_pmu = pmu;
}
@@ -6854,6 +7035,20 @@ void perf_pmu_unregister(struct pmu *pmu)
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
+{
+ int ret;
+
+ if (!try_module_get(pmu->module))
+ return -ENODEV;
+ event->pmu = pmu;
+ ret = pmu->event_init(event);
+ if (ret)
+ module_put(pmu->module);
+
+ return ret;
+}
+
struct pmu *perf_init_event(struct perf_event *event)
{
struct pmu *pmu = NULL;
@@ -6866,24 +7061,14 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
- if (!try_module_get(pmu->module)) {
- pmu = ERR_PTR(-ENODEV);
- goto unlock;
- }
- event->pmu = pmu;
- ret = pmu->event_init(event);
+ ret = perf_try_init_event(pmu, event);
if (ret)
pmu = ERR_PTR(ret);
goto unlock;
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
- if (!try_module_get(pmu->module)) {
- pmu = ERR_PTR(-ENODEV);
- goto unlock;
- }
- event->pmu = pmu;
- ret = pmu->event_init(event);
+ ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
@@ -7247,6 +7432,15 @@ out:
return ret;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -7262,7 +7456,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx;
+ struct perf_event_context *ctx, *uninitialized_var(gctx);
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -7420,7 +7614,19 @@ SYSCALL_DEFINE5(perf_event_open,
* task or CPU context:
*/
if (move_group) {
- if (group_leader->ctx->type != ctx->type)
+ /*
+ * Make sure we're both on the same task, or both
+ * per-cpu events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
+ */
+ if (group_leader->cpu != event->cpu)
goto err_context;
} else {
if (group_leader->ctx != ctx)
@@ -7448,43 +7654,68 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (move_group) {
- struct perf_event_context *gctx = group_leader->ctx;
-
- mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader, false);
+ gctx = group_leader->ctx;
/*
- * Removing from the context ends up with disabled
- * event. What we want here is event in the initial
- * startup state, ready to be add into new context.
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
*/
- perf_event__state_init(group_leader);
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ perf_remove_from_context(group_leader, false);
+
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_remove_from_context(sibling, false);
- perf_event__state_init(sibling);
put_ctx(gctx);
}
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
+ } else {
+ mutex_lock(&ctx->mutex);
}
WARN_ON_ONCE(ctx->parent_ctx);
- mutex_lock(&ctx->mutex);
if (move_group) {
+ /*
+ * Wait for everybody to stop referencing the events through
+ * the old lists, before installing it on new lists.
+ */
synchronize_rcu();
- perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
+
+ /*
+ * Install the group siblings before the group leader.
+ *
+ * Because a group leader will try and install the entire group
+ * (through the sibling list, which is still intact), we can
+ * end up with siblings installed in the wrong context.
+ *
+ * By installing siblings first we NO-OP because they're not
+ * reachable through the group lists.
+ */
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
+ perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
}
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be added into the new context.
+ */
+ perf_event__state_init(group_leader);
+ perf_install_in_context(ctx, group_leader, group_leader->cpu);
+ get_ctx(ctx);
}
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
+
+ if (move_group) {
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ }
mutex_unlock(&ctx->mutex);
put_online_cpus();
@@ -7592,7 +7823,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
- mutex_lock(&src_ctx->mutex);
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
perf_remove_from_context(event, false);
@@ -7600,11 +7835,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
}
- mutex_unlock(&src_ctx->mutex);
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
synchronize_rcu();
- mutex_lock(&dst_ctx->mutex);
+ /*
+ * Re-instate events in 2 passes.
+ *
+ * Skip over group leaders and only install siblings on this first
+ * pass, siblings will not get enabled without a leader, however a
+ * leader will enable its siblings, even if those are still on the old
+ * context.
+ */
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ if (event->group_leader == event)
+ continue;
+
+ list_del(&event->migrate_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+
+ /*
+ * Once all the siblings are setup properly, install the group leaders
+ * to make it go.
+ */
list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
list_del(&event->migrate_entry);
if (event->state >= PERF_EVENT_STATE_OFF)
@@ -7614,6 +7874,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
get_ctx(dst_ctx);
}
mutex_unlock(&dst_ctx->mutex);
+ mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
@@ -7800,14 +8061,19 @@ static void perf_free_event(struct perf_event *event,
put_event(parent);
+ raw_spin_lock_irq(&ctx->lock);
perf_group_detach(event);
list_del_event(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
free_event(event);
}
/*
- * free an unexposed, unused context as created by inheritance by
+ * Free an unexposed, unused context as created by inheritance by
* perf_event_init_task below, used by fork() in case of fail.
+ *
+ * Not all locks are strictly required, but take them anyway to be nice and
+ * help out with the lockdep assertions.
*/
void perf_event_free_task(struct task_struct *task)
{
@@ -8126,7 +8392,7 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
+ INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
}
}
@@ -8147,22 +8413,11 @@ static void perf_event_init_cpu(int cpu)
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
-static void perf_pmu_rotate_stop(struct pmu *pmu)
-{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- WARN_ON(!irqs_disabled());
-
- list_del_init(&cpuctx->rotation_list);
-}
-
static void __perf_event_exit_context(void *__info)
{
struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
- perf_pmu_rotate_stop(ctx->pmu);
-
rcu_read_lock();
list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
__perf_remove_from_context(&re);
@@ -8273,6 +8528,18 @@ void __init perf_event_init(void)
!= 1024);
}
+ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_attr, attr);
+
+ if (pmu_attr->event_str)
+ return sprintf(page, "%s\n", pmu_attr->event_str);
+
+ return 0;
+}
+
static int __init perf_event_sysfs_init(void)
{
struct pmu *pmu;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 146a5792b1d2..eadb95ce7aac 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -13,12 +13,13 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/circ_buf.h>
+#include <linux/poll.h>
#include "internal.h"
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, POLL_IN);
+ atomic_set(&handle->rb->poll, POLLIN);
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending);
diff --git a/kernel/exit.c b/kernel/exit.c
index 6806c55475ee..feff10bbb307 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk)
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
- clear_thread_flag(TIF_MEMDIE);
+ if (test_thread_flag(TIF_MEMDIE))
+ unmark_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2ddade9f1..cf65139615a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
- if (unlikely(tmp->vm_flags & VM_NONLINEAR))
- vma_nonlinear_insert(tmp,
- &mapping->i_mmap_nonlinear);
- else
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
@@ -559,6 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
+ mm_nr_pmds_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
mm->pinned_vm = 0;
@@ -607,6 +604,14 @@ static void check_mm(struct mm_struct *mm)
printk(KERN_ALERT "BUG: Bad rss-counter state "
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
+
+ if (atomic_long_read(&mm->nr_ptes))
+ pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
+ atomic_long_read(&mm->nr_ptes));
+ if (mm_nr_pmds(mm))
+ pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
+ mm_nr_pmds(mm));
+
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 63678b573d61..2579e407ff67 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
if (!p)
return -ESRCH;
- if (!p->mm) {
+ if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
return -EPERM;
}
@@ -2217,7 +2217,7 @@ retry:
if (!abs_time)
goto out;
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
@@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart)
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
@@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_OP:
return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
- return futex_lock_pi(uaddr, flags, val, timeout, 0);
+ return futex_lock_pi(uaddr, flags, timeout, 0);
case FUTEX_UNLOCK_PI:
return futex_unlock_pi(uaddr, flags);
case FUTEX_TRYLOCK_PI:
- return futex_lock_pi(uaddr, flags, 0, timeout, 1);
+ return futex_lock_pi(uaddr, flags, NULL, 1);
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 52aa7e8de927..752d6486b67e 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,33 +1,7 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
-# if-lt
-# Usage VAR := $(call if-lt, $(a), $(b))
-# Returns 1 if (a < b)
-if-lt = $(shell [ $(1) -lt $(2) ] && echo 1)
-
-ifeq ($(CONFIG_GCOV_FORMAT_3_4),y)
- cc-ver := 0304
-else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y)
- cc-ver := 0407
-else
-# Use cc-version if available, otherwise set 0
-#
-# scripts/Kbuild.include, which contains cc-version function, is not included
-# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov"
-# Meaning cc-ver is empty causing if-lt test to fail with
-# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage.
-# This has no affect on the clean phase, but the error message could be
-# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version
-# is not available. We can probably move if-lt to Kbuild.include, so it's also
-# not defined during clean or to include Kbuild.include in
-# scripts/Makefile.clean. But the following workaround seems least invasive.
- cc-ver := $(if $(call cc-version),$(call cc-version),0)
-endif
-
-obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o
-
-ifeq ($(call if-lt, $(cc-ver), 0407),1)
- obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o
-else
- obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o
-endif
+obj-y := base.o fs.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
+ gcc_3_4.o, gcc_4_7.o)
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6f1c7a566b95..eb9a4ea394ab 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
return -ENOSYS;
}
+
+/**
+ * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @on: Whether to set or reset the wake-up capability of this irq
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_wake)
+ return data->chip->irq_set_wake(data, on);
+
+ return -ENOSYS;
+}
#endif
/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 80692373abd6..e68932bb308e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
* Do not use this for shutdown scenarios where you must be sure
* that all parts (hardirq and threaded handler) have completed.
*
+ * Returns: false if a threaded handler is active.
+ *
* This function may be called - with care - from IRQ context.
*/
-void synchronize_hardirq(unsigned int irq)
+bool synchronize_hardirq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc)
+ if (desc) {
__synchronize_hardirq(desc);
+ return !atomic_read(&desc->threads_active);
+ }
+
+ return true;
}
EXPORT_SYMBOL(synchronize_hardirq);
@@ -243,6 +249,9 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
+ /* set the initial affinity to prevent every interrupt being on CPU0 */
+ if (m)
+ __irq_set_affinity(irq, m, false);
return 0;
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
@@ -437,6 +446,32 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
+/**
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending hard IRQ handlers for this
+ * interrupt to complete before returning. If you use this function while
+ * holding a resource the hard IRQ handler may need you will deadlock.
+ *
+ * When used to optimistically disable an interrupt from atomic context
+ * the return value must be checked.
+ *
+ * Returns: false if a threaded handler is active.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+bool disable_hardirq(unsigned int irq)
+{
+ if (!__disable_irq_nosync(irq))
+ return synchronize_hardirq(irq);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(disable_hardirq);
+
void __enable_irq(struct irq_desc *desc, unsigned int irq)
{
switch (desc->depth) {
@@ -1471,8 +1506,13 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
* otherwise we'll have trouble later trying to figure out
* which interrupt is which (messes up the interrupt freeing
* logic etc).
+ *
+ * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
+ * it cannot be set along with IRQF_NO_SUSPEND.
*/
- if ((irqflags & IRQF_SHARED) && !dev_id)
+ if (((irqflags & IRQF_SHARED) && !dev_id) ||
+ (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
+ ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
return -EINVAL;
desc = irq_to_desc(irq);
@@ -1758,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
+
+/**
+ * irq_get_irqchip_state - returns the irqchip state of an interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ * @state: a pointer to a boolean where the state is to be stored
+ *
+ * This call snapshots the internal irqchip state of an
+ * interrupt, returning into @state the bit corresponding to
+ * state @which
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_get_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_get_irqchip_state(data, which, state);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
+
+/**
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
+ *
+ * This call sets the internal irqchip state of an interrupt,
+ * depending on the value of @which.
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool val)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_set_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_set_irqchip_state(data, which, val);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3e18163f336f..474de5cb394d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
struct msi_desc *desc;
for_each_msi_entry(desc, dev) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ /*
+ * We might have failed to allocate an MSI early
+ * enough that there is no IRQ associated to this
+ * entry. If that's the case, don't do anything.
+ */
+ if (desc->irq) {
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ desc->irq = 0;
+ }
}
}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 3ca532592704..5204a6d1b985 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -43,9 +43,12 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
if (action->flags & IRQF_NO_SUSPEND)
desc->no_suspend_depth++;
+ else if (action->flags & IRQF_COND_SUSPEND)
+ desc->cond_suspend_depth++;
WARN_ON_ONCE(desc->no_suspend_depth &&
- desc->no_suspend_depth != desc->nr_actions);
+ (desc->no_suspend_depth +
+ desc->cond_suspend_depth) != desc->nr_actions);
}
/*
@@ -61,6 +64,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
if (action->flags & IRQF_NO_SUSPEND)
desc->no_suspend_depth--;
+ else if (action->flags & IRQF_COND_SUSPEND)
+ desc->cond_suspend_depth--;
}
static bool suspend_device_irq(struct irq_desc *desc, int irq)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9dc9bfd8a678..df2f4642d1e7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -46,10 +46,9 @@ static int show_irq_affinity(int type, struct seq_file *m, void *v)
mask = desc->pending_mask;
#endif
if (type)
- seq_cpumask_list(m, mask);
+ seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
else
- seq_cpumask(m, mask);
- seq_putc(m, '\n');
+ seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
return 0;
}
@@ -67,8 +66,7 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
cpumask_copy(mask, desc->affinity_hint);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- seq_cpumask(m, mask);
- seq_putc(m, '\n');
+ seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
free_cpumask_var(mask);
return 0;
@@ -186,8 +184,7 @@ static const struct file_operations irq_affinity_list_proc_fops = {
static int default_affinity_show(struct seq_file *m, void *v)
{
- seq_cpumask(m, irq_default_affinity);
- seq_putc(m, '\n');
+ seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity));
return 0;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a8a01abbaed..38c25b1f2fd5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
}
/*
- * Free up memory used by kernel, initrd, and comand line. This is temporary
+ * Free up memory used by kernel, initrd, and command line. This is temporary
* memory allocation which is not needed any more after these buffers have
* been loaded into separate segments and have been copied elsewhere.
*/
@@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
destination &= PAGE_MASK;
result = kimage_add_entry(image, destination | IND_DESTINATION);
- if (result == 0)
- image->destination = destination;
return result;
}
@@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
page &= PAGE_MASK;
result = kimage_add_entry(image, page | IND_SOURCE);
- if (result == 0)
- image->destination += PAGE_SIZE;
return result;
}
@@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if (nr_segments > 0) {
unsigned long i;
- /* Loading another kernel to reboot into */
- if ((flags & KEXEC_ON_CRASH) == 0)
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- /* Loading another kernel to switch to if this one crashes */
- else if (flags & KEXEC_ON_CRASH) {
- /* Free any current crash dump kernel before
+ if (flags & KEXEC_ON_CRASH) {
+ /*
+ * Loading another kernel to switch to if this one
+ * crashes. Free any current crash dump kernel before
* we corrupt it.
*/
+
kimage_free(xchg(&kexec_crash_image, NULL));
result = kimage_alloc_init(&image, entry, nr_segments,
segments, flags);
crash_map_reserved_pages();
+ } else {
+ /* Loading another kernel to reboot into. */
+
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
}
if (result)
goto out;
@@ -2512,7 +2511,7 @@ static int kexec_apply_relocations(struct kimage *image)
continue;
/*
- * Respective archicture needs to provide support for applying
+ * Respective architecture needs to provide support for applying
* relocations of type SHT_RELA/SHT_REL.
*/
if (sechdrs[i].sh_type == SHT_RELA)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 06f58309fed2..c90e417bb963 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -127,7 +127,7 @@ static void *alloc_insn_page(void)
static void free_insn_page(void *page)
{
- module_free(NULL, page);
+ module_memfree(page);
}
struct kprobe_insn_cache kprobe_insn_slots = {
@@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p)
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
- arch_prepare_optimized_kprobe(op);
+ arch_prepare_optimized_kprobe(op, p);
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
INIT_LIST_HEAD(&op->list);
op->kp.addr = p->addr;
- arch_prepare_optimized_kprobe(op);
+ arch_prepare_optimized_kprobe(op, p);
return &op->kp;
}
@@ -869,7 +869,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
- unoptimize_kprobe(p, false); /* Try to unoptimize */
+ /* Try to unoptimize */
+ unoptimize_kprobe(p, kprobes_all_disarmed);
if (!kprobe_queued(p)) {
arch_disarm_kprobe(p);
@@ -1571,7 +1572,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
- disarm_kprobe(orig_p, true);
+ /*
+ * If kprobes_all_disarmed is set, orig_p
+ * should have already been disarmed, so
+ * skip the unneeded disarming process.
+ */
+ if (!kprobes_all_disarmed)
+ disarm_kprobe(orig_p, true);
orig_p->flags |= KPROBE_FLAG_DISABLED;
}
}
@@ -2320,6 +2327,12 @@ static void arm_all_kprobes(void)
if (!kprobes_all_disarmed)
goto already_enabled;
+ /*
+ * optimize_kprobe() called by arm_kprobe() checks
+ * kprobes_all_disarmed, so set kprobes_all_disarmed before
+ * arm_kprobe.
+ */
+ kprobes_all_disarmed = false;
/* Arming kprobes doesn't optimize kprobe itself */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
@@ -2328,7 +2341,6 @@ static void arm_all_kprobes(void)
arm_kprobe(p);
}
- kprobes_all_disarmed = false;
printk(KERN_INFO "Kprobes globally enabled\n");
already_enabled:
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
new file mode 100644
index 000000000000..045022557936
--- /dev/null
+++ b/kernel/livepatch/Kconfig
@@ -0,0 +1,18 @@
+config HAVE_LIVEPATCH
+ bool
+ help
+ Arch supports kernel live patching
+
+config LIVEPATCH
+ bool "Kernel Live Patching"
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on MODULES
+ depends on SYSFS
+ depends on KALLSYMS_ALL
+ depends on HAVE_LIVEPATCH
+ help
+ Say Y here if you want to support kernel live patching.
+ This option has no runtime impact until a kernel "patch"
+ module uses the interface provided by this option to register
+ a patch, causing calls to patched functions to be redirected
+ to new function code contained in the patch module.
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
new file mode 100644
index 000000000000..e8780c0901d9
--- /dev/null
+++ b/kernel/livepatch/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+
+livepatch-objs := core.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
new file mode 100644
index 000000000000..284e2691e380
--- /dev/null
+++ b/kernel/livepatch/core.c
@@ -0,0 +1,1003 @@
+/*
+ * core.c - Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/ftrace.h>
+#include <linux/list.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+
+/**
+ * struct klp_ops - structure for tracking registered ftrace ops structs
+ *
+ * A single ftrace_ops is shared between all enabled replacement functions
+ * (klp_func structs) which have the same old_addr. This allows the switch
+ * between function versions to happen instantaneously by updating the klp_ops
+ * struct's func_stack list. The winner is the klp_func at the top of the
+ * func_stack (front of the list).
+ *
+ * An instance is allocated by klp_enable_func() when no klp_ops exists yet for
+ * the function's old_addr. It is freed by klp_disable_func() once the last
+ * klp_func leaves its func_stack, or on the error path of klp_enable_func().
+ *
+ * @node: node for the global klp_ops list
+ * @func_stack: list head for the stack of klp_func's (active func is on top)
+ * @fops: registered ftrace ops struct
+ */
+struct klp_ops {
+ struct list_head node;
+ struct list_head func_stack;
+ struct ftrace_ops fops;
+};
+
+/*
+ * The klp_mutex protects the global lists and state transitions of any
+ * structure reachable from them. References to any structure must be obtained
+ * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
+ * ensure it gets consistent data).
+ */
+static DEFINE_MUTEX(klp_mutex);
+
+/* registered patches; klp_init_patch() appends, so this is registration order */
+static LIST_HEAD(klp_patches);
+/* one klp_ops per patched old_addr, looked up by klp_find_ops() */
+static LIST_HEAD(klp_ops);
+
+/* the "livepatch" sysfs directory; set by klp_init(), tested by klp_initialized() */
+static struct kobject *klp_root_kobj;
+
+/*
+ * Return the klp_ops that patches @old_addr, or NULL if there is none.
+ * Only the klp_func at the front of each func_stack is compared.
+ */
+static struct klp_ops *klp_find_ops(unsigned long old_addr)
+{
+	struct klp_ops *cur;
+
+	list_for_each_entry(cur, &klp_ops, node) {
+		struct klp_func *top;
+
+		top = list_first_entry(&cur->func_stack, struct klp_func,
+				       stack_node);
+		if (top->old_addr != old_addr)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* An object with a name is a module; an unnamed object is treated as vmlinux. */
+static bool klp_is_module(struct klp_object *obj)
+{
+	return obj->name != NULL;
+}
+
+/* An unnamed object always counts as loaded; a named one only once obj->mod is set. */
+static bool klp_is_object_loaded(struct klp_object *obj)
+{
+	if (!obj->name)
+		return true;
+
+	return obj->mod != NULL;
+}
+
+/*
+ * Set obj->mod when @obj names a module that is currently present and has
+ * its klp_alive flag set. Objects without a name are left untouched.
+ */
+static void klp_find_object_module(struct klp_object *obj)
+{
+	struct module *found;
+
+	if (!klp_is_module(obj))
+		return;
+
+	mutex_lock(&module_mutex);
+
+	/*
+	 * No reference is taken on the module: we do not want to block
+	 * removal of patched modules. The patches are removed by a going
+	 * module handler instead.
+	 */
+	found = find_module(obj->name);
+
+	/*
+	 * Do not interfere with the work of the module coming and going
+	 * notifiers. Note that the patch might still be needed before the
+	 * going handler is called: module functions can be called even in
+	 * the GOING state until mod->exit() finishes. This is especially
+	 * important for patches that modify the semantics of the functions.
+	 */
+	if (found && found->klp_alive)
+		obj->mod = found;
+
+	mutex_unlock(&module_mutex);
+}
+
+/*
+ * Tell whether @patch is on the klp_patches list.
+ * klp_mutex must be held by caller.
+ */
+static bool klp_is_patch_registered(struct klp_patch *patch)
+{
+	struct klp_patch *cur;
+
+	list_for_each_entry(cur, &klp_patches, list) {
+		if (cur == patch)
+			return true;
+	}
+
+	return false;
+}
+
+/* True once klp_root_kobj has been created. */
+static bool klp_initialized(void)
+{
+	return klp_root_kobj != NULL;
+}
+
+/* Search key and result passed to klp_find_callback() by klp_find_object_symbol(). */
+struct klp_find_arg {
+ /* module name to match, or NULL to match only symbols without a module */
+ const char *objname;
+ /* symbol name to match */
+ const char *name;
+ /* address of the most recent match */
+ unsigned long addr;
+ /*
+ * If count == 0, the symbol was not found. If count == 1, a unique
+ * match was found and addr is set. If count > 1, there is
+ * unresolvable ambiguity among "count" number of symbols with the same
+ * name in the same object.
+ */
+ unsigned long count;
+};
+
+/*
+ * kallsyms_on_each_symbol() callback: count the symbols that match the
+ * klp_find_arg in @data and remember the address of the latest match.
+ * Always returns 0.
+ */
+static int klp_find_callback(void *data, const char *name,
+			     struct module *mod, unsigned long addr)
+{
+	struct klp_find_arg *arg = data;
+	bool want_module = arg->objname != NULL;
+	bool in_module = mod != NULL;
+
+	/* a module symbol can only match a module request and vice versa */
+	if (want_module != in_module)
+		return 0;
+
+	if (strcmp(arg->name, name) != 0)
+		return 0;
+
+	if (want_module && strcmp(arg->objname, mod->name) != 0)
+		return 0;
+
+	/*
+	 * arg->addr is overwritten by every further match;
+	 * klp_find_object_symbol() only hands the address out when
+	 * count == 1, so that is harmless.
+	 */
+	arg->count++;
+	arg->addr = addr;
+
+	return 0;
+}
+
+/*
+ * Look up symbol @name in object @objname (NULL selects symbols that do not
+ * belong to a module) and store its address in *@addr.
+ *
+ * Return: 0 when exactly one match exists; otherwise -EINVAL with *@addr
+ * set to 0 (symbol missing or ambiguous).
+ */
+static int klp_find_object_symbol(const char *objname, const char *name,
+				  unsigned long *addr)
+{
+	struct klp_find_arg found = {
+		.objname = objname,
+		.name = name,
+		.addr = 0,
+		.count = 0
+	};
+
+	kallsyms_on_each_symbol(klp_find_callback, &found);
+
+	if (found.count == 1) {
+		*addr = found.addr;
+		return 0;
+	}
+
+	if (found.count == 0)
+		pr_err("symbol '%s' not found in symbol table\n", name);
+	else
+		pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
+		       found.count, name, objname);
+
+	*addr = 0;
+	return -EINVAL;
+}
+
+/* Expected (name, address) pair checked by klp_verify_callback(). */
+struct klp_verify_args {
+ /* symbol name that must be found */
+ const char *name;
+ /* address the symbol must have */
+ const unsigned long addr;
+};
+
+/*
+ * kallsyms_on_each_symbol() callback: return 1 when the visited symbol has
+ * no module and matches both the name and the address in @data, else 0.
+ */
+static int klp_verify_callback(void *data, const char *name,
+			       struct module *mod, unsigned long addr)
+{
+	struct klp_verify_args *want = data;
+
+	if (mod)
+		return 0;
+
+	if (strcmp(want->name, name) != 0)
+		return 0;
+
+	return want->addr == addr;
+}
+
+/*
+ * Check that a non-module symbol called @name exists at exactly @addr.
+ *
+ * Return: 0 if it does, -EINVAL (after logging an error) otherwise.
+ */
+static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
+{
+	struct klp_verify_args want = {
+		.name = name,
+		.addr = addr,
+	};
+
+	if (!kallsyms_on_each_symbol(klp_verify_callback, &want)) {
+		pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
+		       name, addr);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Establish func->old_addr: look it up by name when the object is a module
+ * or no address was supplied, otherwise verify the supplied address.
+ *
+ * Return: 0 on success, otherwise the error from the lookup/verification.
+ */
+static int klp_find_verify_func_addr(struct klp_object *obj,
+				     struct klp_func *func)
+{
+#ifdef CONFIG_RANDOMIZE_BASE
+	/* KASLR is enabled, disregard old_addr from user */
+	func->old_addr = 0;
+#endif
+
+	if (klp_is_module(obj) || !func->old_addr)
+		return klp_find_object_symbol(obj->name, func->old_name,
+					      &func->old_addr);
+
+	return klp_verify_vmlinux_symbol(func->old_name, func->old_addr);
+}
+
+/*
+ * Resolve an external symbol, i.e. one located outside the parent object
+ * (where the parent object is either vmlinux or the kmod being patched).
+ *
+ * Return: 0 with *@addr set on success, otherwise the error from
+ * klp_find_object_symbol().
+ */
+static int klp_find_external_symbol(struct module *pmod, const char *name,
+				    unsigned long *addr)
+{
+	const struct kernel_symbol *ksym;
+	bool exported;
+
+	/* first, check if it's an exported symbol */
+	preempt_disable();
+	ksym = find_symbol(name, NULL, NULL, true, true);
+	exported = ksym != NULL;
+	if (exported)
+		*addr = ksym->value;
+	preempt_enable();
+
+	if (exported)
+		return 0;
+
+	/* otherwise check if it's in another .o within the patch module */
+	return klp_find_object_symbol(pmod->name, name, addr);
+}
+
+/*
+ * Resolve and write every relocation listed in obj->relocs (the array is
+ * terminated by an entry with a NULL name) into the patch module @pmod.
+ *
+ * Return: 0 on success, -EINVAL if @obj is not loaded or has no relocs,
+ * otherwise the first error from symbol resolution or from
+ * klp_write_module_reloc().
+ */
+static int klp_write_object_relocations(struct module *pmod,
+					struct klp_object *obj)
+{
+	struct klp_reloc *reloc;
+	int ret;
+
+	if (WARN_ON(!klp_is_object_loaded(obj)))
+		return -EINVAL;
+
+	if (WARN_ON(!obj->relocs))
+		return -EINVAL;
+
+	for (reloc = obj->relocs; reloc->name; reloc++) {
+		if (!klp_is_module(obj))
+			/* vmlinux: reloc->val is given, only verify it */
+			ret = klp_verify_vmlinux_symbol(reloc->name,
+							reloc->val);
+		else if (reloc->external)
+			/* module, reloc->val needs to be discovered */
+			ret = klp_find_external_symbol(pmod, reloc->name,
+						       &reloc->val);
+		else
+			ret = klp_find_object_symbol(obj->mod->name,
+						     reloc->name,
+						     &reloc->val);
+		if (ret)
+			return ret;
+
+		ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
+					     reloc->val + reloc->addend);
+		if (ret) {
+			pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
+			       reloc->name, reloc->val, ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * ftrace callback installed by klp_enable_func(). Takes the klp_func at the
+ * front of the owning klp_ops' func_stack (under rcu_read_lock()) and passes
+ * its new_func to klp_arch_set_pc(). Warns once and changes nothing if the
+ * stack is empty.
+ */
+static void notrace klp_ftrace_handler(unsigned long ip,
+				       unsigned long parent_ip,
+				       struct ftrace_ops *fops,
+				       struct pt_regs *regs)
+{
+	struct klp_ops *ops = container_of(fops, struct klp_ops, fops);
+	struct klp_func *top;
+
+	rcu_read_lock();
+	top = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
+				     stack_node);
+	if (!WARN_ON_ONCE(!top))
+		klp_arch_set_pc(regs, (unsigned long)top->new_func);
+	rcu_read_unlock();
+}
+
+/*
+ * Remove @func from the func_stack of its klp_ops. When @func is the only
+ * entry, the ftrace handler is unregistered and the filter removed before
+ * the entry is unlinked, and the klp_ops is then unlinked and freed.
+ */
+static void klp_disable_func(struct klp_func *func)
+{
+	struct klp_ops *ops;
+	bool last;
+
+	WARN_ON(func->state != KLP_ENABLED);
+	WARN_ON(!func->old_addr);
+
+	ops = klp_find_ops(func->old_addr);
+	if (WARN_ON(!ops))
+		return;
+
+	last = list_is_singular(&ops->func_stack);
+	if (last) {
+		/* tear down the ftrace hook while @func is still on the stack */
+		WARN_ON(unregister_ftrace_function(&ops->fops));
+		WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
+	}
+
+	list_del_rcu(&func->stack_node);
+
+	if (last) {
+		list_del(&ops->node);
+		kfree(ops);
+	}
+
+	func->state = KLP_DISABLED;
+}
+
+/*
+ * Put @func on top of the func_stack for its old_addr. If no klp_ops exists
+ * for that address yet, one is allocated, added to the global list and
+ * registered with ftrace.
+ *
+ * Return: 0 on success, -EINVAL for an unresolved or already enabled func,
+ * -ENOMEM on allocation failure, or the ftrace error.
+ */
+static int klp_enable_func(struct klp_func *func)
+{
+	struct klp_ops *ops;
+	int ret;
+
+	if (WARN_ON(!func->old_addr))
+		return -EINVAL;
+
+	if (WARN_ON(func->state != KLP_DISABLED))
+		return -EINVAL;
+
+	ops = klp_find_ops(func->old_addr);
+	if (ops) {
+		/* address is already hooked: just become the new top */
+		list_add_rcu(&func->stack_node, &ops->func_stack);
+		goto enabled;
+	}
+
+	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	ops->fops.func = klp_ftrace_handler;
+	ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
+			  FTRACE_OPS_FL_DYNAMIC |
+			  FTRACE_OPS_FL_IPMODIFY;
+
+	list_add(&ops->node, &klp_ops);
+
+	INIT_LIST_HEAD(&ops->func_stack);
+	list_add_rcu(&func->stack_node, &ops->func_stack);
+
+	ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
+	if (ret) {
+		pr_err("failed to set ftrace filter for function '%s' (%d)\n",
+		       func->old_name, ret);
+		goto err_free_ops;
+	}
+
+	ret = register_ftrace_function(&ops->fops);
+	if (ret) {
+		pr_err("failed to register ftrace handler for function '%s' (%d)\n",
+		       func->old_name, ret);
+		/* drop the filter that was set just above */
+		ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
+		goto err_free_ops;
+	}
+
+enabled:
+	func->state = KLP_ENABLED;
+
+	return 0;
+
+err_free_ops:
+	list_del_rcu(&func->stack_node);
+	list_del(&ops->node);
+	kfree(ops);
+	return ret;
+}
+
+/* Disable every currently enabled func of @obj and mark @obj disabled. */
+static void klp_disable_object(struct klp_object *obj)
+{
+	struct klp_func *func;
+
+	for (func = obj->funcs; func->old_name; func++) {
+		if (func->state != KLP_ENABLED)
+			continue;
+
+		klp_disable_func(func);
+	}
+
+	obj->state = KLP_DISABLED;
+}
+
+/*
+ * Enable every func of @obj. If one of them fails, the funcs enabled so far
+ * are disabled again.
+ *
+ * Return: 0 on success, -EINVAL if @obj is not disabled or not loaded,
+ * otherwise the error from klp_enable_func().
+ */
+static int klp_enable_object(struct klp_object *obj)
+{
+	struct klp_func *func;
+	int ret;
+
+	if (WARN_ON(obj->state != KLP_DISABLED))
+		return -EINVAL;
+
+	if (WARN_ON(!klp_is_object_loaded(obj)))
+		return -EINVAL;
+
+	for (func = obj->funcs; func->old_name; func++) {
+		ret = klp_enable_func(func);
+		if (ret)
+			goto rollback;
+	}
+
+	obj->state = KLP_ENABLED;
+
+	return 0;
+
+rollback:
+	klp_disable_object(obj);
+	return ret;
+}
+
+/*
+ * Disable all enabled objects of @patch and mark the patch disabled.
+ *
+ * Return: 0 on success, -EBUSY if the next patch on the list is enabled.
+ */
+static int __klp_disable_patch(struct klp_patch *patch)
+{
+	struct klp_object *obj;
+
+	/* enforce stacking: only the last enabled patch can be disabled */
+	if (!list_is_last(&patch->list, &klp_patches)) {
+		struct klp_patch *above = list_next_entry(patch, list);
+
+		if (above->state == KLP_ENABLED)
+			return -EBUSY;
+	}
+
+	pr_notice("disabling patch '%s'\n", patch->mod->name);
+
+	for (obj = patch->objs; obj->funcs; obj++) {
+		if (obj->state != KLP_ENABLED)
+			continue;
+
+		klp_disable_object(obj);
+	}
+
+	patch->state = KLP_DISABLED;
+
+	return 0;
+}
+
+/**
+ * klp_disable_patch() - disables a registered patch
+ * @patch: The registered, enabled patch to be disabled
+ *
+ * Unregisters the patched functions from ftrace. Fails with -EINVAL when
+ * @patch is not registered or is already disabled.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_disable_patch(struct klp_patch *patch)
+{
+	int ret;
+
+	mutex_lock(&klp_mutex);
+
+	/* patch->state is only looked at once the patch is known to be ours */
+	if (!klp_is_patch_registered(patch) || patch->state == KLP_DISABLED)
+		ret = -EINVAL;
+	else
+		ret = __klp_disable_patch(patch);
+
+	mutex_unlock(&klp_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(klp_disable_patch);
+
+/*
+ * Taint the kernel and enable every loaded object of @patch. If an object
+ * fails, the patch is disabled again.
+ *
+ * Return: 0 on success, -EINVAL if @patch is not disabled, -EBUSY if the
+ * previous patch on the list is disabled, otherwise the error from
+ * klp_enable_object().
+ */
+static int __klp_enable_patch(struct klp_patch *patch)
+{
+	struct klp_object *obj;
+	int ret;
+
+	if (WARN_ON(patch->state != KLP_DISABLED))
+		return -EINVAL;
+
+	/* enforce stacking: only the first disabled patch can be enabled */
+	if (patch->list.prev != &klp_patches) {
+		struct klp_patch *below = list_prev_entry(patch, list);
+
+		if (below->state == KLP_DISABLED)
+			return -EBUSY;
+	}
+
+	pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
+	add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+
+	pr_notice("enabling patch '%s'\n", patch->mod->name);
+
+	for (obj = patch->objs; obj->funcs; obj++) {
+		/* objects that are not loaded yet are skipped here */
+		if (!klp_is_object_loaded(obj))
+			continue;
+
+		ret = klp_enable_object(obj);
+		if (ret) {
+			WARN_ON(__klp_disable_patch(patch));
+			return ret;
+		}
+	}
+
+	patch->state = KLP_ENABLED;
+
+	return 0;
+}
+
+/**
+ * klp_enable_patch() - enables a registered patch
+ * @patch: The registered, disabled patch to be enabled
+ *
+ * Performs the needed symbol lookups and code relocations,
+ * then registers the patched functions with ftrace. Fails with -EINVAL
+ * when @patch is not registered.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_enable_patch(struct klp_patch *patch)
+{
+	int ret;
+
+	mutex_lock(&klp_mutex);
+
+	if (klp_is_patch_registered(patch))
+		ret = __klp_enable_patch(patch);
+	else
+		ret = -EINVAL;
+
+	mutex_unlock(&klp_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(klp_enable_patch);
+
+/*
+ * Sysfs Interface
+ *
+ * /sys/kernel/livepatch
+ * /sys/kernel/livepatch/<patch>
+ * /sys/kernel/livepatch/<patch>/enabled
+ * /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/<func>
+ */
+
+/*
+ * sysfs store handler for the patch "enabled" attribute. Accepts a decimal
+ * KLP_DISABLED or KLP_ENABLED value and switches the patch to that state.
+ *
+ * Return: @count on success; -EINVAL for unparsable or out-of-range input
+ * or when the patch is already in the requested state; otherwise the error
+ * from __klp_enable_patch()/__klp_disable_patch().
+ */
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct klp_patch *patch;
+	unsigned long val;
+	int ret;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	if (val != KLP_DISABLED && val != KLP_ENABLED)
+		return -EINVAL;
+
+	patch = container_of(kobj, struct klp_patch, kobj);
+
+	mutex_lock(&klp_mutex);
+
+	if (val == patch->state)
+		/* already in requested state */
+		ret = -EINVAL;
+	else if (val == KLP_ENABLED)
+		ret = __klp_enable_patch(patch);
+	else
+		ret = __klp_disable_patch(patch);
+
+	mutex_unlock(&klp_mutex);
+
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/* sysfs show handler for "enabled": prints patch->state as a decimal number. */
+static ssize_t enabled_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buf)
+{
+	struct klp_patch *patch = container_of(kobj, struct klp_patch, kobj);
+
+	return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
+}
+
+/* the "enabled" attribute; __ATTR_RW wires it to enabled_show()/enabled_store() */
+static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
+/* NULL-terminated default attribute list used by klp_ktype_patch */
+static struct attribute *klp_patch_attrs[] = {
+ &enabled_kobj_attr.attr,
+ NULL
+};
+
+/* kobject release callback for a patch kobject; currently does nothing. */
+static void klp_kobj_release_patch(struct kobject *kobj)
+{
+ /*
+ * Once we have a consistency model we'll need to module_put() the
+ * patch module here. See klp_register_patch() for more details.
+ */
+}
+
+/* kobject type of klp_patch.kobj: exposes the attributes in klp_patch_attrs */
+static struct kobj_type klp_ktype_patch = {
+ .release = klp_kobj_release_patch,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = klp_patch_attrs,
+};
+
+/*
+ * kobject release callback for a func kobject; intentionally empty.
+ * NOTE(review): presumably the klp_func memory belongs to the patch module
+ * and so must not be freed here - confirm.
+ */
+static void klp_kobj_release_func(struct kobject *kobj)
+{
+}
+
+/* kobject type of klp_func.kobj: a sysfs directory with no default attributes */
+static struct kobj_type klp_ktype_func = {
+ .release = klp_kobj_release_func,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+/*
+ * Drop the kobject reference of each func in obj->funcs, stopping before
+ * @limit. With @limit == NULL the whole array is walked.
+ */
+static void klp_free_funcs_limited(struct klp_object *obj,
+				   struct klp_func *limit)
+{
+	struct klp_func *func = obj->funcs;
+
+	while (func->old_name && func != limit) {
+		kobject_put(&func->kobj);
+		func++;
+	}
+}
+
+/*
+ * Clean up when a patched object is unloaded: forget the module pointer
+ * and the resolved address of every func.
+ */
+static void klp_free_object_loaded(struct klp_object *obj)
+{
+	struct klp_func *func = obj->funcs;
+
+	obj->mod = NULL;
+
+	while (func->old_name) {
+		func->old_addr = 0;
+		func++;
+	}
+}
+
+/*
+ * Drop the kobject references of each object in patch->objs (all of its
+ * funcs first, then the object itself), stopping before @limit. With
+ * @limit == NULL the whole array is walked.
+ */
+static void klp_free_objects_limited(struct klp_patch *patch,
+				     struct klp_object *limit)
+{
+	struct klp_object *obj = patch->objs;
+
+	while (obj->funcs && obj != limit) {
+		klp_free_funcs_limited(obj, NULL);
+		kobject_put(obj->kobj);
+		obj++;
+	}
+}
+
+/*
+ * Release all object and func kobjects of @patch, unlink the patch from the
+ * klp_patches list if it is linked, and drop the patch kobject reference.
+ */
+static void klp_free_patch(struct klp_patch *patch)
+{
+	bool linked;
+
+	klp_free_objects_limited(patch, NULL);
+
+	linked = !list_empty(&patch->list);
+	if (linked)
+		list_del(&patch->list);
+
+	kobject_put(&patch->kobj);
+}
+
+/*
+ * Put @func into its initial (disabled) state and create its sysfs kobject
+ * below @obj's kobject, named after the function being patched.
+ *
+ * Return: 0 on success, otherwise the error from kobject_init_and_add().
+ */
+static int klp_init_func(struct klp_object *obj, struct klp_func *func)
+{
+	int ret;
+
+	INIT_LIST_HEAD(&func->stack_node);
+	func->state = KLP_DISABLED;
+
+	ret = kobject_init_and_add(&func->kobj, &klp_ktype_func,
+				   obj->kobj, "%s", func->old_name);
+	/*
+	 * A kobject whose kobject_init_and_add() failed still holds its
+	 * initial reference (and its name) and has to be dropped with
+	 * kobject_put(). klp_init_object() only puts the funcs that were
+	 * set up before this one, so it must be done here.
+	 */
+	if (ret)
+		kobject_put(&func->kobj);
+
+	return ret;
+}
+
+/*
+ * The part of object initialization that can only be done once the object
+ * is loaded: write the relocations (if any) and resolve/verify the address
+ * of every func.
+ *
+ * Return: 0 on success, otherwise the first error encountered.
+ */
+static int klp_init_object_loaded(struct klp_patch *patch,
+				  struct klp_object *obj)
+{
+	struct klp_func *func;
+	int ret = 0;
+
+	if (obj->relocs)
+		ret = klp_write_object_relocations(patch->mod, obj);
+
+	/* not entered after a relocation error; stops at the first failure */
+	for (func = obj->funcs; !ret && func->old_name; func++)
+		ret = klp_find_verify_func_addr(obj, func);
+
+	return ret;
+}
+
+/*
+ * Initialize @obj: reset its state, look up its module, create its sysfs
+ * directory ("vmlinux" for the unnamed object) below the patch kobject,
+ * initialize every func and, if the object is loaded, finish the
+ * load-dependent part via klp_init_object_loaded().
+ *
+ * Return: 0 on success, -EINVAL without funcs, -ENOMEM if the kobject cannot
+ * be created, otherwise the error from func/object initialization.
+ */
+static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
+{
+	const char *dirname = "vmlinux";
+	struct klp_func *func;
+	int ret;
+
+	if (!obj->funcs)
+		return -EINVAL;
+
+	obj->state = KLP_DISABLED;
+	obj->mod = NULL;
+
+	klp_find_object_module(obj);
+
+	if (klp_is_module(obj))
+		dirname = obj->name;
+
+	obj->kobj = kobject_create_and_add(dirname, &patch->kobj);
+	if (!obj->kobj)
+		return -ENOMEM;
+
+	for (func = obj->funcs; func->old_name; func++) {
+		ret = klp_init_func(obj, func);
+		if (ret)
+			goto err_free;
+	}
+
+	if (klp_is_object_loaded(obj)) {
+		ret = klp_init_object_loaded(patch, obj);
+		if (ret)
+			goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	/* func is the first one NOT initialized (or the array terminator) */
+	klp_free_funcs_limited(obj, func);
+	kobject_put(obj->kobj);
+	return ret;
+}
+
+/*
+ * Initialize @patch under klp_mutex: create its sysfs directory (named after
+ * the patch module) below klp_root_kobj, initialize all of its objects and
+ * append it to the klp_patches list.
+ *
+ * Return: 0 on success, -EINVAL without objs, otherwise the error from
+ * kobject or object initialization.
+ */
+static int klp_init_patch(struct klp_patch *patch)
+{
+	struct klp_object *obj;
+	int ret;
+
+	if (!patch->objs)
+		return -EINVAL;
+
+	mutex_lock(&klp_mutex);
+
+	patch->state = KLP_DISABLED;
+
+	ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
+				   klp_root_kobj, "%s", patch->mod->name);
+	/*
+	 * Even when kobject_init_and_add() fails the kobject holds its
+	 * initial reference (and its name), so it has to be dropped with
+	 * kobject_put() as well.
+	 */
+	if (ret)
+		goto put;
+
+	for (obj = patch->objs; obj->funcs; obj++) {
+		ret = klp_init_object(patch, obj);
+		if (ret)
+			goto free;
+	}
+
+	list_add_tail(&patch->list, &klp_patches);
+
+	mutex_unlock(&klp_mutex);
+
+	return 0;
+
+free:
+	/* obj itself was already cleaned up by klp_init_object() */
+	klp_free_objects_limited(patch, obj);
+put:
+	kobject_put(&patch->kobj);
+	mutex_unlock(&klp_mutex);
+	return ret;
+}
+
+/**
+ * klp_unregister_patch() - unregisters a patch
+ * @patch: Disabled patch to be unregistered
+ *
+ * Frees the data structures and removes the sysfs interface. Fails with
+ * -EINVAL when @patch is not registered and with -EBUSY when it is still
+ * enabled.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_unregister_patch(struct klp_patch *patch)
+{
+	int ret;
+
+	mutex_lock(&klp_mutex);
+
+	if (!klp_is_patch_registered(patch)) {
+		ret = -EINVAL;
+	} else if (patch->state == KLP_ENABLED) {
+		ret = -EBUSY;
+	} else {
+		klp_free_patch(patch);
+		ret = 0;
+	}
+
+	mutex_unlock(&klp_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(klp_unregister_patch);
+
+/**
+ * klp_register_patch() - registers a patch
+ * @patch: Patch to be registered
+ *
+ * Initializes the data structure associated with the patch and
+ * creates the sysfs interface. Fails with -ENODEV when livepatch is not
+ * initialized or the patch module cannot be pinned, and with -EINVAL when
+ * @patch or its module pointer is NULL.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_register_patch(struct klp_patch *patch)
+{
+	int ret;
+
+	if (!klp_initialized())
+		return -ENODEV;
+
+	if (!patch || !patch->mod)
+		return -EINVAL;
+
+	/*
+	 * A reference is taken on the patch module to prevent it from being
+	 * unloaded. Right now, we don't allow patch modules to unload since
+	 * there is currently no method to determine if a thread is still
+	 * running in the patched code contained in the patch module once
+	 * the ftrace registration is successful.
+	 */
+	if (!try_module_get(patch->mod))
+		return -ENODEV;
+
+	ret = klp_init_patch(patch);
+	if (!ret)
+		return 0;
+
+	/* initialization failed: give the module reference back */
+	module_put(patch->mod);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(klp_register_patch);
+
+/*
+ * Handle a module that is being loaded and is targeted by @obj of @patch:
+ * finish the load-dependent initialization and, when the patch is enabled,
+ * apply it to the module. Failures are only logged.
+ */
+static void klp_module_notify_coming(struct klp_patch *patch,
+				     struct klp_object *obj)
+{
+	struct module *pmod = patch->mod;
+	struct module *mod = obj->mod;
+	int ret;
+
+	ret = klp_init_object_loaded(patch, obj);
+	if (!ret) {
+		/* a disabled patch is only prepared, not applied */
+		if (patch->state == KLP_DISABLED)
+			return;
+
+		pr_notice("applying patch '%s' to loading module '%s'\n",
+			  pmod->name, mod->name);
+
+		ret = klp_enable_object(obj);
+		if (!ret)
+			return;
+	}
+
+	pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+		pmod->name, mod->name, ret);
+}
+
+/*
+ * Handle a module that is being unloaded and is targeted by @obj of @patch:
+ * revert the patch on it if the patch is enabled, then reset the
+ * load-dependent state of @obj.
+ */
+static void klp_module_notify_going(struct klp_patch *patch,
+				    struct klp_object *obj)
+{
+	struct module *pmod = patch->mod;
+	struct module *mod = obj->mod;
+
+	if (patch->state != KLP_DISABLED) {
+		pr_notice("reverting patch '%s' on unloading module '%s'\n",
+			  pmod->name, mod->name);
+
+		klp_disable_object(obj);
+	}
+
+	klp_free_object_loaded(obj);
+}
+
+/*
+ * Module notifier callback. For COMING and GOING modules, record the state
+ * in mod->klp_alive and run the coming/going handler on the first object of
+ * every registered patch whose name matches the module. Other actions are
+ * ignored. Always returns 0.
+ */
+static int klp_module_notify(struct notifier_block *nb, unsigned long action,
+			     void *data)
+{
+	struct module *mod = data;
+	struct klp_patch *patch;
+	struct klp_object *obj;
+	bool coming;
+
+	switch (action) {
+	case MODULE_STATE_COMING:
+		coming = true;
+		break;
+	case MODULE_STATE_GOING:
+		coming = false;
+		break;
+	default:
+		return 0;
+	}
+
+	mutex_lock(&klp_mutex);
+
+	/*
+	 * Each module has to know that the notifier has been called.
+	 * We never know what module will get patched by a new patch.
+	 */
+	mod->klp_alive = coming;
+
+	list_for_each_entry(patch, &klp_patches, list) {
+		for (obj = patch->objs; obj->funcs; obj++) {
+			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+				continue;
+
+			if (coming) {
+				obj->mod = mod;
+				klp_module_notify_coming(patch, obj);
+			} else {
+				klp_module_notify_going(patch, obj);
+			}
+
+			/* only the first matching object of a patch is handled */
+			break;
+		}
+	}
+
+	mutex_unlock(&klp_mutex);
+
+	return 0;
+}
+
+/* module notifier block registered by klp_init(); dispatches to klp_module_notify() */
+static struct notifier_block klp_module_nb = {
+ .notifier_call = klp_module_notify,
+ .priority = INT_MIN+1, /* called late but before ftrace notifier */
+};
+
+/*
+ * Module init: check compiler support, register the module notifier and
+ * create the "livepatch" kobject below kernel_kobj.
+ *
+ * Return: 0 on success, -EINVAL without compiler support, -ENOMEM if the
+ * kobject cannot be created, otherwise the notifier registration error.
+ */
+static int klp_init(void)
+{
+	int ret;
+
+	if (klp_check_compiler_support()) {
+		pr_info("Your compiler is too old; turning off.\n");
+		return -EINVAL;
+	}
+
+	ret = register_module_notifier(&klp_module_nb);
+	if (ret)
+		return ret;
+
+	klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
+	if (klp_root_kobj)
+		return 0;
+
+	/* no sysfs root: undo the notifier registration */
+	unregister_module_notifier(&klp_module_nb);
+	return -ENOMEM;
+}
+
+module_init(klp_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8541bfdfd232..de7a416cca2a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,11 +1,11 @@
-obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
+obj-y += mutex.o semaphore.o rwsem.o
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_lockdep.o = -pg
-CFLAGS_REMOVE_lockdep_proc.o = -pg
-CFLAGS_REMOVE_mutex-debug.o = -pg
-CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d4420ad2..ba77ab5f64dd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class)
if (!new_class->name)
return 0;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
if (new_class->key - new_class->subclass == class->key)
return class->name_version;
if (class->name && !strcmp(class->name, new_class->name))
@@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
hash_head = classhashentry(key);
/*
- * We can walk the hash lockfree, because the hash only
- * grows, and we are careful when adding entries to the end:
+ * We do an RCU walk of the hash, see lockdep_free_key_range().
*/
- list_for_each_entry(class, hash_head, hash_entry) {
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return NULL;
+
+ list_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
@@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct list_head *hash_head;
struct lock_class *class;
- unsigned long flags;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
if (likely(class))
@@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
key = lock->key->subkeys + subclass;
hash_head = classhashentry(key);
- raw_local_irq_save(flags);
if (!graph_lock()) {
- raw_local_irq_restore(flags);
return NULL;
}
/*
* We have to do the hash-walk again, to avoid races
* with another CPU:
*/
- list_for_each_entry(class, hash_head, hash_entry)
+ list_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key)
goto out_unlock_set;
+ }
+
/*
* Allocate a new key from the static array, and add it to
* the hash:
*/
if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
if (!debug_locks_off_graph_unlock()) {
- raw_local_irq_restore(flags);
return NULL;
}
- raw_local_irq_restore(flags);
print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
@@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (verbose(class)) {
graph_unlock();
- raw_local_irq_restore(flags);
printk("\nnew class %p: %s", class->key, class->name);
if (class->name_version > 1)
@@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
printk("\n");
dump_stack();
- raw_local_irq_save(flags);
if (!graph_lock()) {
- raw_local_irq_restore(flags);
return NULL;
}
}
out_unlock_set:
graph_unlock();
- raw_local_irq_restore(flags);
out_set_class_cache:
if (!subclass || force)
@@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
entry->distance = distance;
entry->trace = *trace;
/*
- * Since we never remove from the dependency list, the list can
- * be walked lockless by other CPUs, it's only allocation
- * that must be protected by the spinlock. But this also means
- * we must make new entries visible only once writes to the
- * entry become visible - hence the RCU op:
+ * Both allocation and removal are done under the graph lock; but
+ * iteration is under RCU-sched; see look_up_lock_class() and
+ * lockdep_free_key_range().
*/
list_add_tail_rcu(&entry->entry, head);
@@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry,
else
head = &lock->class->locks_before;
- list_for_each_entry(entry, head, entry) {
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ list_for_each_entry_rcu(entry, head, entry) {
if (!lock_accessed(entry)) {
unsigned int cq_depth;
mark_lock_accessed(entry, lock);
@@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
* We can walk it lock-free, because entries only get added
* to the hash:
*/
- list_for_each_entry(chain, hash_head, entry) {
+ list_for_each_entry_rcu(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(chain_lookup_hits);
@@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
if (unlikely(!debug_locks))
return;
- if (subclass)
+ if (subclass) {
+ unsigned long flags;
+
+ if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
register_lock_class(lock, subclass, 1);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+ }
}
EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size)
return addr >= start && addr < start + size;
}
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
void lockdep_free_key_range(void *start, unsigned long size)
{
- struct lock_class *class, *next;
+ struct lock_class *class;
struct list_head *head;
unsigned long flags;
int i;
@@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
head = classhash_table + i;
if (list_empty(head))
continue;
- list_for_each_entry_safe(class, next, head, hash_entry) {
+ list_for_each_entry_rcu(class, head, hash_entry) {
if (within(class->key, start, size))
zap_class(class);
else if (within(class->name, start, size))
@@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size)
if (locked)
graph_unlock();
raw_local_irq_restore(flags);
+
+ /*
+ * Wait for any possible iterators from look_up_lock_class() to pass
+ * before continuing to free the memory they refer to.
+ *
+ * sync_sched() is sufficient because the read side runs with IRQs disabled.
+ */
+ synchronize_sched();
+
+ /*
+ * XXX at this point we could return the resources to the pool;
+ * instead we leak them. We would need to change to bitmap allocators
+ * instead of the linear allocators we have now.
+ */
}
void lockdep_reset_lock(struct lockdep_map *lock)
{
- struct lock_class *class, *next;
+ struct lock_class *class;
struct list_head *head;
unsigned long flags;
int i, j;
@@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
head = classhash_table + i;
if (list_empty(head))
continue;
- list_for_each_entry_safe(class, next, head, hash_entry) {
+ list_for_each_entry_rcu(class, head, hash_entry) {
int match = 0;
for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 4d60986fcbee..75e114bdf3f2 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
*/
return;
}
- ACCESS_ONCE(prev->next) = node;
+ WRITE_ONCE(prev->next, node);
/* Wait until the lock holder passes the lock down. */
arch_mcs_spin_lock_contended(&node->locked);
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
static inline
void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
- struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+ struct mcs_spinlock *next = READ_ONCE(node->next);
if (likely(!next)) {
/*
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
+ while (!(next = READ_ONCE(node->next)))
cpu_relax_lowlatency();
}
@@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
arch_mcs_spin_unlock_contended(&next->locked);
}
-/*
- * Cancellable version of the MCS lock above.
- *
- * Intended for adaptive spinning of sleeping locks:
- * mutex_lock()/rwsem_down_{read,write}() etc.
- */
-
-struct optimistic_spin_node {
- struct optimistic_spin_node *next, *prev;
- int locked; /* 1 if lock acquired */
- int cpu; /* encoded CPU # value */
-};
-
-extern bool osq_lock(struct optimistic_spin_queue *lock);
-extern void osq_unlock(struct optimistic_spin_queue *lock);
-
#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 454195194d4a..4cccea6b8934 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,7 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mcs_spinlock.h"
+#include <linux/osq_lock.h>
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
* The mutex must later on be released by the same task that
* acquired it. Recursive locking is not allowed. The task
* may not exit without first unlocking the mutex. Also, kernel
- * memory where the mutex resides mutex must not be freed with
+ * memory where the mutex resides must not be freed with
* the mutex still locked. The mutex must first be initialized
* (or statically defined) before it can be locked. memset()-ing
* the mutex to 0 is not allowed.
@@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
}
/*
- * after acquiring lock with fastpath or when we lost out in contested
+ * After acquiring lock with fastpath or when we lost out in contested
* slowpath, set ctx and wake up any waiters so they can recheck.
*
* This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
@@ -191,57 +191,61 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
spin_unlock_mutex(&lock->base.wait_lock, flags);
}
-
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * In order to avoid a stampede of mutex spinners from acquiring the mutex
- * more or less simultaneously, the spinners need to acquire a MCS lock
- * first before spinning on the owner field.
+ * After acquiring lock in the slowpath set ctx and wake up any
+ * waiters so they can recheck.
*
+ * Callers must hold the mutex wait_lock.
*/
-
-/*
- * Mutex spinning code migrated from kernel/sched/core.c
- */
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+static __always_inline void
+ww_mutex_set_context_slowpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
{
- if (lock->owner != owner)
- return false;
+ struct mutex_waiter *cur;
+
+ ww_mutex_lock_acquired(lock, ctx);
+ lock->ctx = ctx;
/*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
+ * Give any possible sleeping processes the chance to wake up,
+ * so they can recheck if they have to back off.
*/
- barrier();
-
- return owner->on_cpu;
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
* Look out! "owner" is an entirely speculative pointer
* access and not reliable.
*/
static noinline
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
+ bool ret = true;
+
rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner->on_cpu || need_resched()) {
+ ret = false;
break;
+ }
cpu_relax_lowlatency();
}
rcu_read_unlock();
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
+ return ret;
}
/*
@@ -256,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
return 0;
rcu_read_lock();
- owner = ACCESS_ONCE(lock->owner);
+ owner = READ_ONCE(lock->owner);
if (owner)
retval = owner->on_cpu;
rcu_read_unlock();
@@ -307,6 +311,11 @@ static bool mutex_optimistic_spin(struct mutex *lock,
if (!mutex_can_spin_on_owner(lock))
goto done;
+ /*
+ * In order to avoid a stampede of mutex spinners trying to
+ * acquire the mutex all at once, the spinners need to take a
+ * MCS (queued) lock first before spinning on the owner field.
+ */
if (!osq_lock(&lock->osq))
goto done;
@@ -325,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* As such, when deadlock detection needs to be
* performed the optimistic spinning cannot be done.
*/
- if (ACCESS_ONCE(ww->ctx))
+ if (READ_ONCE(ww->ctx))
break;
}
@@ -333,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
- owner = ACCESS_ONCE(lock->owner);
+ owner = READ_ONCE(lock->owner);
if (owner && !mutex_spin_on_owner(lock, owner))
break;
@@ -469,10 +478,10 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock)
EXPORT_SYMBOL(ww_mutex_unlock);
static inline int __sched
-__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
{
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
if (!hold_ctx)
return 0;
@@ -557,7 +566,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
if (use_ww_ctx && ww_ctx->acquired > 0) {
- ret = __mutex_lock_check_stamp(lock, ww_ctx);
+ ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
if (ret)
goto err;
}
@@ -569,6 +578,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
schedule_preempt_disabled();
spin_lock_mutex(&lock->wait_lock, flags);
}
+ __set_task_state(task, TASK_RUNNING);
+
mutex_remove_waiter(lock, &waiter, current_thread_info());
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
@@ -582,23 +593,7 @@ skip_wait:
if (use_ww_ctx) {
struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct mutex_waiter *cur;
-
- /*
- * This branch gets optimized out for the common case,
- * and is only important for ww_mutex_lock.
- */
- ww_mutex_lock_acquired(ww, ww_ctx);
- ww->ctx = ww_ctx;
-
- /*
- * Give any possible sleeping processes the chance to wake up,
- * so they can recheck if they have to back off.
- */
- list_for_each_entry(cur, &lock->wait_list, list) {
- debug_mutex_wake_waiter(lock, cur);
- wake_up_process(cur->task);
- }
+ ww_mutex_set_context_slowpath(ww, ww_ctx);
}
spin_unlock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c
index 9887a905a762..dc85ee23a26f 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,8 +1,6 @@
#include <linux/percpu.h>
#include <linux/sched.h>
-#include "mcs_spinlock.h"
-
-#ifdef CONFIG_SMP
+#include <linux/osq_lock.h>
/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -100,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
- ACCESS_ONCE(prev->next) = node;
+ WRITE_ONCE(prev->next, node);
/*
* Normally @prev is untouchable after the above store; because at that
@@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!smp_load_acquire(&node->locked)) {
+ while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
*/
@@ -150,7 +148,7 @@ unqueue:
* Or we race against a concurrent unqueue()'s step-B, in which
* case its step-C will write us a new @node->prev pointer.
*/
- prev = ACCESS_ONCE(node->prev);
+ prev = READ_ONCE(node->prev);
}
/*
@@ -172,8 +170,8 @@ unqueue:
* it will wait in Step-A.
*/
- ACCESS_ONCE(next->prev) = prev;
- ACCESS_ONCE(prev->next) = next;
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
return false;
}
@@ -195,14 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
- ACCESS_ONCE(next->locked) = 1;
+ WRITE_ONCE(next->locked, 1);
return;
}
next = osq_wait_next(lock, node, NULL);
if (next)
- ACCESS_ONCE(next->locked) = 1;
+ WRITE_ONCE(next->locked, 1);
}
-
-#endif
-
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7c98873a3077..b73279367087 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
*
* @task: the task owning the mutex (owner) for which a chain walk is
* probably needed
- * @deadlock_detect: do we have to carry out deadlock detection?
+ * @chwalk: do we have to carry out deadlock detection?
* @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
* things for a task that has just got its priority adjusted, and
* is waiting on a mutex)
@@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(state);
}
+ __set_current_state(TASK_RUNNING);
return ret;
}
@@ -1188,12 +1189,13 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
if (likely(!ret))
+ /* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
- set_current_state(TASK_RUNNING);
-
if (unlikely(ret)) {
- remove_waiter(lock, &waiter);
+ __set_current_state(TASK_RUNNING);
+ if (rt_mutex_has_waiters(lock))
+ remove_waiter(lock, &waiter);
rt_mutex_handle_deadlock(ret, chwalk, &waiter);
}
@@ -1626,10 +1628,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
set_current_state(TASK_INTERRUPTIBLE);
+ /* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- set_current_state(TASK_RUNNING);
-
if (unlikely(ret))
remove_waiter(lock, waiter);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 2c93571162cb..3a5048572065 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
list_del(&waiter->list);
tsk = waiter->task;
+ /*
+ * Make sure we do not wake up the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);
@@ -154,7 +161,7 @@ void __sched __down_read(struct rw_semaphore *sem)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
}
- tsk->state = TASK_RUNNING;
+ __set_task_state(tsk, TASK_RUNNING);
out:
;
}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7628c3fc37ca..3417d0172a5d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -14,8 +14,9 @@
#include <linux/init.h>
#include <linux/export.h>
#include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
-#include "mcs_spinlock.h"
+#include "rwsem.h"
/*
* Guide to the rw_semaphore's count field for common values.
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
waiter = list_entry(next, struct rwsem_waiter, list);
next = waiter->list.next;
tsk = waiter->task;
+ /*
+ * Make sure we do not wake up the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
smp_mb();
waiter->task = NULL;
wake_up_process(tsk);
@@ -242,8 +250,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
schedule();
}
- tsk->state = TASK_RUNNING;
-
+ __set_task_state(tsk, TASK_RUNNING);
return sem;
}
EXPORT_SYMBOL(rwsem_down_read_failed);
@@ -259,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
if (!list_is_singular(&sem->wait_list))
rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ rwsem_set_owner(sem);
return true;
}
@@ -271,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = ACCESS_ONCE(sem->count);
+ long old, count = READ_ONCE(sem->count);
while (true) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
- if (old == count)
+ if (old == count) {
+ rwsem_set_owner(sem);
return true;
+ }
count = old;
}
@@ -288,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
- bool on_cpu = false;
+ bool ret = true;
if (need_resched())
return false;
rcu_read_lock();
- owner = ACCESS_ONCE(sem->owner);
- if (owner)
- on_cpu = owner->on_cpu;
- rcu_read_unlock();
-
- /*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath, then there is a possibility reader(s) may have the lock.
- * To be safe, avoid spinning in these situations.
- */
- return on_cpu;
-}
-
-static inline bool owner_running(struct rw_semaphore *sem,
- struct task_struct *owner)
-{
- if (sem->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * sem->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
+ owner = READ_ONCE(sem->owner);
+ if (!owner) {
+ long count = READ_ONCE(sem->count);
+ /*
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath with the lock being active, then there is a possibility
+ * reader(s) may have the lock. To be safe, bail spinning in these
+ * situations.
+ */
+ if (count & RWSEM_ACTIVE_MASK)
+ ret = false;
+ goto done;
+ }
- return owner->on_cpu;
+ ret = owner->on_cpu;
+done:
+ rcu_read_unlock();
+ return ret;
}
static noinline
bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
{
+ long count;
+
rcu_read_lock();
- while (owner_running(sem, owner)) {
- if (need_resched())
- break;
+ while (sem->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ /* abort spinning when need_resched or owner is not running */
+ if (!owner->on_cpu || need_resched()) {
+ rcu_read_unlock();
+ return false;
+ }
cpu_relax_lowlatency();
}
rcu_read_unlock();
+ if (READ_ONCE(sem->owner))
+ return true; /* new owner, continue spinning */
+
/*
- * We break out the loop above on need_resched() or when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when sem->owner is NULL.
+ * When the owner is not set, the lock could be free or
+ * held by readers. Check the counter to verify the
+ * state.
*/
- return sem->owner == NULL;
+ count = READ_ONCE(sem->count);
+ return (count == 0 || count == RWSEM_WAITING_BIAS);
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
@@ -359,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
goto done;
while (true) {
- owner = ACCESS_ONCE(sem->owner);
+ owner = READ_ONCE(sem->owner);
if (owner && !rwsem_spin_on_owner(sem, owner))
break;
@@ -433,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
/* we're now waiting on the lock, but no longer actively locking */
if (waiting) {
- count = ACCESS_ONCE(sem->count);
+ count = READ_ONCE(sem->count);
/*
* If there were already threads queued before us and there are
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e2d3bc7f03b4..205be0ce34de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -9,29 +9,9 @@
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rwsem.h>
-
#include <linux/atomic.h>
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
- sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
- sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
+#include "rwsem.h"
/*
* lock for reading
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000000..870ed9a5b426
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..db3ccb1dd614 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
}
EXPORT_SYMBOL(_raw_spin_lock_nested);
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
+
unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
int subclass)
{
diff --git a/kernel/module.c b/kernel/module.c
index 3965511ae133..ec53f594e9c9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -772,9 +772,18 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
return 0;
}
-unsigned long module_refcount(struct module *mod)
+/**
+ * module_refcount - return the refcount or -1 if unloading
+ *
+ * @mod: the module we're checking
+ *
+ * Returns:
+ * -1 if the module is in the process of unloading
+ * otherwise the number of references in the kernel to the module
+ */
+int module_refcount(struct module *mod)
{
- return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE;
+ return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);
@@ -856,7 +865,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
struct module_use *use;
int printed_something = 0;
- seq_printf(m, " %lu ", module_refcount(mod));
+ seq_printf(m, " %i ", module_refcount(mod));
/*
* Always include a trailing , so userspace can differentiate
@@ -908,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
static ssize_t show_refcnt(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
+ return sprintf(buffer, "%i\n", module_refcount(mk->mod));
}
static struct module_attribute modinfo_refcnt =
@@ -1216,6 +1225,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
const unsigned long *crc;
int err;
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
mutex_lock(&module_mutex);
sym = find_symbol(name, &owner, &crc,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
@@ -1795,7 +1810,7 @@ static void unset_module_core_ro_nx(struct module *mod) { }
static void unset_module_init_ro_nx(struct module *mod) { }
#endif
-void __weak module_free(struct module *mod, void *module_region)
+void __weak module_memfree(void *module_region)
{
vfree(module_region);
}
@@ -1804,6 +1819,10 @@ void __weak module_arch_cleanup(struct module *mod)
{
}
+void __weak module_arch_freeing_init(struct module *mod)
+{
+}
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -1841,16 +1860,17 @@ static void free_module(struct module *mod)
/* This may be NULL, but that's OK */
unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
+ module_arch_freeing_init(mod);
+ module_memfree(mod->module_init);
kfree(mod->args);
percpu_modfree(mod);
- /* Free lock-classes: */
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod->module_core, mod->core_size);
/* Finally, free the core (containing the module structure) */
unset_module_core_ro_nx(mod);
- module_free(mod, mod->module_core);
+ module_memfree(mod->module_core);
#ifdef CONFIG_MPU
update_protections(current->mm);
@@ -2291,11 +2311,13 @@ static void layout_symtab(struct module *mod, struct load_info *info)
info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
mod->core_size += strtab_size;
+ mod->core_size = debug_align(mod->core_size);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
info->index.str) | INIT_OFFSET_MASK;
+ mod->init_size = debug_align(mod->init_size);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
}
@@ -2457,6 +2479,23 @@ static int elf_header_check(struct load_info *info)
return 0;
}
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+ do {
+ unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+ if (copy_from_user(dst, usrc, n) != 0)
+ return -EFAULT;
+ cond_resched();
+ dst += n;
+ usrc += n;
+ len -= n;
+ } while (len);
+ return 0;
+}
+
/* Sets info->hdr and info->len. */
static int copy_module_from_user(const void __user *umod, unsigned long len,
struct load_info *info)
@@ -2476,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
if (!info->hdr)
return -ENOMEM;
- if (copy_from_user(info->hdr, umod, info->len) != 0) {
+ if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
vfree(info->hdr);
return -EFAULT;
}
@@ -2785,7 +2824,7 @@ static int move_module(struct module *mod, struct load_info *info)
*/
kmemleak_ignore(ptr);
if (!ptr) {
- module_free(mod, mod->module_core);
+ module_memfree(mod->module_core);
return -ENOMEM;
}
memset(ptr, 0, mod->init_size);
@@ -2930,8 +2969,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
- module_free(mod, mod->module_init);
- module_free(mod, mod->module_core);
+ module_arch_freeing_init(mod);
+ module_memfree(mod->module_init);
+ module_memfree(mod->module_core);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -2963,6 +3003,12 @@ static bool finished_loading(const char *name)
struct module *mod;
bool ret;
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
ret = !mod || mod->state == MODULE_STATE_LIVE
@@ -2983,10 +3029,36 @@ static void do_mod_ctors(struct module *mod)
#endif
}
-/* This is where the real work happens */
-static int do_init_module(struct module *mod)
+/* For freeing module_init on success, in case kallsyms is traversing it */
+struct mod_initfree {
+ struct rcu_head rcu;
+ void *module_init;
+};
+
+static void do_free_init(struct rcu_head *head)
+{
+ struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
+ module_memfree(m->module_init);
+ kfree(m);
+}
+
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
{
int ret = 0;
+ struct mod_initfree *freeinit;
+
+ freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+ if (!freeinit) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ freeinit->module_init = mod->module_init;
/*
* We want to find out whether @mod uses async during init. Clear
@@ -2999,18 +3071,7 @@ static int do_init_module(struct module *mod)
if (mod->init != NULL)
ret = do_one_initcall(mod->init);
if (ret < 0) {
- /*
- * Init routine failed: abort. Try to protect us from
- * buggy refcounters.
- */
- mod->state = MODULE_STATE_GOING;
- synchronize_sched();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- free_module(mod);
- wake_up_all(&module_wq);
- return ret;
+ goto fail_free_freeinit;
}
if (ret > 0) {
pr_warn("%s: '%s'->init suspiciously returned %d, it should "
@@ -3055,15 +3116,35 @@ static int do_init_module(struct module *mod)
mod->strtab = mod->core_strtab;
#endif
unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
+ module_arch_freeing_init(mod);
mod->module_init = NULL;
mod->init_size = 0;
mod->init_ro_size = 0;
mod->init_text_size = 0;
+ /*
+ * We want to free module_init, but be aware that kallsyms may be
+ * walking this with preempt disabled. In all the failure paths,
+ * we call synchronize_rcu/synchronize_sched, but we don't want
+ * to slow down the success path, so use actual RCU here.
+ */
+ call_rcu(&freeinit->rcu, do_free_init);
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
return 0;
+
+fail_free_freeinit:
+ kfree(freeinit);
+fail:
+ /* Try to protect us from buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+ return ret;
}
static int may_init_module(void)
@@ -3075,32 +3156,6 @@ static int may_init_module(void)
}
/*
- * Can't use wait_event_interruptible() because our condition
- * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
- */
-static int wait_finished_loading(struct module *mod)
-{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- int ret = 0;
-
- add_wait_queue(&module_wq, &wait);
- for (;;) {
- if (finished_loading(mod->name))
- break;
-
- if (signal_pending(current)) {
- ret = -ERESTARTSYS;
- break;
- }
-
- wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
- }
- remove_wait_queue(&module_wq, &wait);
-
- return ret;
-}
-
-/*
* We try to place it in the list now to make sure it's unique before
* we dedicate too many resources. In particular, temporary percpu
* memory exhaustion.
@@ -3120,8 +3175,8 @@ again:
|| old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
-
- err = wait_finished_loading(mod);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
if (err)
goto out_unlocked;
goto again;
@@ -3220,7 +3275,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod->sig_ok = info->sig_ok;
if (!mod->sig_ok) {
pr_notice_once("%s: module verification failed: signature "
- "and/or required key missing - tainting "
+ "and/or required key missing - tainting "
"kernel\n", mod->name);
add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
}
@@ -3334,6 +3389,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
+ /* Free lock-classes; relies on the preceding synchronize_rcu() */
+ lockdep_free_key_range(mod->module_core, mod->core_size);
+
module_deallocate(mod, info);
free_copy:
free_copy(info);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4803da6eab62..ae9fc7cc360e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
+#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
+#endif /* CONFIG_SRCU */
+
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
diff --git a/kernel/padata.c b/kernel/padata.c
index 161402f0b517..b38bea9c466a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -917,15 +917,10 @@ static ssize_t show_cpumask(struct padata_instance *pinst,
else
cpumask = pinst->cpumask.pcpu;
- len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
- nr_cpu_ids);
- if (PAGE_SIZE - len < 2)
- len = -EINVAL;
- else
- len += sprintf(buf + len, "\n");
-
+ len = snprintf(buf, PAGE_SIZE, "%*pb\n",
+ nr_cpu_ids, cpumask_bits(cpumask));
mutex_unlock(&pinst->lock);
- return len;
+ return len < PAGE_SIZE ? len : -EINVAL;
}
static ssize_t store_cpumask(struct padata_instance *pinst,
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d8d6f906dec..8136ad76e5fd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -226,6 +226,7 @@ static const struct tnt tnts[] = {
{ TAINT_OOT_MODULE, 'O', ' ' },
{ TAINT_UNSIGNED_MODULE, 'E', ' ' },
{ TAINT_SOFTLOCKUP, 'L', ' ' },
+ { TAINT_LIVEPATCH, 'K', ' ' },
};
/**
@@ -246,6 +247,7 @@ static const struct tnt tnts[] = {
* 'O' - Out-of-tree module has been loaded.
* 'E' - Unsigned module has been loaded.
* 'L' - A soft lockup has previously occurred.
+ * 'K' - Kernel has been live patched.
*
* The string is overwritten by the next call to print_tainted().
*/
diff --git a/kernel/params.c b/kernel/params.c
index 0af9b2c4e56c..728e05b167de 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -642,12 +642,15 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
mk->mp->grp.attrs = new_attrs;
/* Tack new one on the end. */
+ memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
mk->mp->attrs[mk->mp->num].param = kp;
mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
/* Do not allow runtime DAC changes to make param writable. */
if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ else
+ mk->mp->attrs[mk->mp->num].mattr.store = NULL;
mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
mk->mp->num++;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 48b28d387c7f..7e01f78f0417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -251,6 +251,7 @@ config APM_EMULATION
config PM_OPP
bool
+ select SRCU
---help---
SOCs have a standard set of tuples consisting of frequency and
voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5a6ec8678b9a..564f786df470 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs = elapsed_msecs64;
if (todo) {
- printk("\n");
- printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "
+ pr_cont("\n");
+ pr_err("Freezing of tasks %s after %d.%03d seconds "
"(%d tasks refusing to freeze, wq_busy=%d):\n",
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
@@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only)
read_unlock(&tasklist_lock);
}
} else {
- printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
+ pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
}
-static bool __check_frozen_processes(void)
-{
- struct task_struct *g, *p;
-
- for_each_process_thread(g, p)
- if (p != current && !freezer_should_skip(p) && !frozen(p))
- return false;
-
- return true;
-}
-
-/*
- * Returns true if all freezable tasks (except for current) are frozen already
- */
-static bool check_frozen_processes(void)
-{
- bool ret;
-
- read_lock(&tasklist_lock);
- ret = __check_frozen_processes();
- read_unlock(&tasklist_lock);
- return ret;
-}
-
/**
* freeze_processes - Signal user space processes to enter the refrigerator.
* The current thread will not be frozen. The same process that calls
@@ -142,7 +118,6 @@ static bool check_frozen_processes(void)
int freeze_processes(void)
{
int error;
- int oom_kills_saved;
error = __usermodehelper_disable(UMH_FREEZING);
if (error)
@@ -155,31 +130,24 @@ int freeze_processes(void)
atomic_inc(&system_freezing_cnt);
pm_wakeup_clear();
- printk("Freezing user space processes ... ");
+ pr_info("Freezing user space processes ... ");
pm_freezing = true;
- oom_kills_saved = oom_kills_count();
error = try_to_freeze_tasks(true);
if (!error) {
__usermodehelper_set_disable_depth(UMH_DISABLED);
- oom_killer_disable();
-
- /*
- * There might have been an OOM kill while we were
- * freezing tasks and the killed task might be still
- * on the way out so we have to double check for race.
- */
- if (oom_kills_count() != oom_kills_saved &&
- !check_frozen_processes()) {
- __usermodehelper_set_disable_depth(UMH_ENABLED);
- printk("OOM in progress.");
- error = -EBUSY;
- } else {
- printk("done.");
- }
+ pr_cont("done.");
}
- printk("\n");
+ pr_cont("\n");
BUG_ON(in_atomic());
+ /*
+ * Now that the whole userspace is frozen we need to disable
+ * the OOM killer to disallow any further interference with
+ * killable tasks.
+ */
+ if (!error && !oom_killer_disable())
+ error = -EBUSY;
+
if (error)
thaw_processes();
return error;
@@ -197,13 +165,14 @@ int freeze_kernel_threads(void)
{
int error;
- printk("Freezing remaining freezable tasks ... ");
+ pr_info("Freezing remaining freezable tasks ... ");
+
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
if (!error)
- printk("done.");
+ pr_cont("done.");
- printk("\n");
+ pr_cont("\n");
BUG_ON(in_atomic());
if (error)
@@ -224,7 +193,7 @@ void thaw_processes(void)
oom_killer_enable();
- printk("Restarting tasks ... ");
+ pr_info("Restarting tasks ... ");
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
@@ -243,7 +212,7 @@ void thaw_processes(void)
usermodehelper_enable();
schedule();
- printk("done.\n");
+ pr_cont("done.\n");
trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
@@ -252,7 +221,7 @@ void thaw_kernel_threads(void)
struct task_struct *g, *p;
pm_nosig_freezing = false;
- printk("Restarting kernel threads ... ");
+ pr_info("Restarting kernel threads ... ");
thaw_workqueues();
@@ -264,5 +233,5 @@ void thaw_kernel_threads(void)
read_unlock(&tasklist_lock);
schedule();
- printk("done.\n");
+ pr_cont("done.\n");
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 5f4c006c4b1e..97b0df71303e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -41,6 +41,8 @@
#include <linux/platform_device.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/export.h>
@@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
c->target_value = value;
}
+static inline int pm_qos_get_value(struct pm_qos_constraints *c);
+static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
+{
+ struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
+ struct pm_qos_constraints *c;
+ struct pm_qos_request *req;
+ char *type;
+ unsigned long flags;
+ int tot_reqs = 0;
+ int active_reqs = 0;
+
+ if (IS_ERR_OR_NULL(qos)) {
+ pr_err("%s: bad qos param!\n", __func__);
+ return -EINVAL;
+ }
+ c = qos->constraints;
+ if (IS_ERR_OR_NULL(c)) {
+ pr_err("%s: Bad constraints on qos?\n", __func__);
+ return -EINVAL;
+ }
+
+ /* Lock to ensure we have a snapshot */
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ if (plist_head_empty(&c->list)) {
+ seq_puts(s, "Empty!\n");
+ goto out;
+ }
+
+ switch (c->type) {
+ case PM_QOS_MIN:
+ type = "Minimum";
+ break;
+ case PM_QOS_MAX:
+ type = "Maximum";
+ break;
+ case PM_QOS_SUM:
+ type = "Sum";
+ break;
+ default:
+ type = "Unknown";
+ }
+
+ plist_for_each_entry(req, &c->list, node) {
+ char *state = "Default";
+
+ if ((req->node).prio != c->default_value) {
+ active_reqs++;
+ state = "Active";
+ }
+ tot_reqs++;
+ seq_printf(s, "%d: %d: %s\n", tot_reqs,
+ (req->node).prio, state);
+ }
+
+ seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
+ type, pm_qos_get_value(c), active_reqs, tot_reqs);
+
+out:
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+ return 0;
+}
+
+static int pm_qos_dbg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pm_qos_dbg_show_requests,
+ inode->i_private);
+}
+
+static const struct file_operations pm_qos_debug_fops = {
+ .open = pm_qos_dbg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/**
* pm_qos_update_target - manages the constraints list and calls the notifiers
* if needed
@@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
/* User space interface to PM QoS classes via misc devices */
-static int register_pm_qos_misc(struct pm_qos_object *qos)
+static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
{
qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
qos->pm_qos_power_miscdev.name = qos->name;
qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+ if (d) {
+ (void)debugfs_create_file(qos->name, S_IRUGO, d,
+ (void *)qos, &pm_qos_debug_fops);
+ }
+
return misc_register(&qos->pm_qos_power_miscdev);
}
@@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void)
{
int ret = 0;
int i;
+ struct dentry *d;
BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
+ d = debugfs_create_dir("pm_qos", NULL);
+ if (IS_ERR_OR_NULL(d))
+ d = NULL;
+
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
- ret = register_pm_qos_misc(pm_qos_array[i]);
+ ret = register_pm_qos_misc(pm_qos_array[i], d);
if (ret < 0) {
printk(KERN_ERR "pm_qos_param: %s setup failed\n",
pm_qos_array[i]->name);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0c40c16174b4..5235dd4e1e2f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
}
}
-static bool is_nosave_page(unsigned long pfn)
-{
- struct nosave_region *region;
-
- list_for_each_entry(region, &nosave_regions, list) {
- if (pfn >= region->start_pfn && pfn < region->end_pfn) {
- pr_err("PM: %#010llx in e820 nosave region: "
- "[mem %#010llx-%#010llx]\n",
- (unsigned long long) pfn << PAGE_SHIFT,
- (unsigned long long) region->start_pfn << PAGE_SHIFT,
- ((unsigned long long) region->end_pfn << PAGE_SHIFT)
- - 1);
- return true;
- }
- }
-
- return false;
-}
-
/**
* create_basic_memory_bitmaps - create bitmaps needed for marking page
* frames that should not be saved and free page frames. The pointers
@@ -1472,9 +1453,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
/**
* free_unnecessary_pages - Release preallocated pages not needed for the image
*/
-static void free_unnecessary_pages(void)
+static unsigned long free_unnecessary_pages(void)
{
- unsigned long save, to_free_normal, to_free_highmem;
+ unsigned long save, to_free_normal, to_free_highmem, free;
save = count_data_pages();
if (alloc_normal >= save) {
@@ -1495,6 +1476,7 @@ static void free_unnecessary_pages(void)
else
to_free_normal = 0;
}
+ free = to_free_normal + to_free_highmem;
memory_bm_position_reset(&copy_bm);
@@ -1518,6 +1500,8 @@ static void free_unnecessary_pages(void)
swsusp_unset_page_free(page);
__free_page(page);
}
+
+ return free;
}
/**
@@ -1707,7 +1691,7 @@ int hibernate_preallocate_memory(void)
* pages in memory, but we have allocated more. Release the excessive
* ones now.
*/
- free_unnecessary_pages();
+ pages -= free_unnecessary_pages();
out:
stop = ktime_get();
@@ -2039,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
do {
pfn = memory_bm_next_pfn(bm);
if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
+ if (likely(pfn_valid(pfn)))
swsusp_set_page_free(pfn_to_page(pfn));
else
return -EFAULT;
@@ -2310,8 +2294,6 @@ static inline void free_highmem_data(void)
free_image_page(buffer, PG_UNSAFE_CLEAR);
}
#else
-static inline int get_safe_write_buffer(void) { return 0; }
-
static unsigned int
count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3ce3a55..b7d6b3a721b1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
void freeze_set_ops(const struct platform_freeze_ops *ops)
{
@@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
static void freeze_begin(void)
{
- suspend_freeze_wake = false;
+ suspend_freeze_state = FREEZE_STATE_NONE;
}
static void freeze_enter(void)
{
- cpuidle_use_deepest_state(true);
+ spin_lock_irq(&suspend_freeze_lock);
+ if (pm_wakeup_pending())
+ goto out;
+
+ suspend_freeze_state = FREEZE_STATE_ENTER;
+ spin_unlock_irq(&suspend_freeze_lock);
+
+ get_online_cpus();
cpuidle_resume();
- wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+ /* Push all the CPUs into the idle loop. */
+ wake_up_all_idle_cpus();
+ pr_debug("PM: suspend-to-idle\n");
+ /* Make the current CPU wait so it can enter the idle loop too. */
+ wait_event(suspend_freeze_wait_head,
+ suspend_freeze_state == FREEZE_STATE_WAKE);
+ pr_debug("PM: resume from suspend-to-idle\n");
+
cpuidle_pause();
- cpuidle_use_deepest_state(false);
+ put_online_cpus();
+
+ spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+ suspend_freeze_state = FREEZE_STATE_NONE;
+ spin_unlock_irq(&suspend_freeze_lock);
}
void freeze_wake(void)
{
- suspend_freeze_wake = true;
- wake_up(&suspend_freeze_wait_head);
+ unsigned long flags;
+
+ spin_lock_irqsave(&suspend_freeze_lock, flags);
+ if (suspend_freeze_state > FREEZE_STATE_NONE) {
+ suspend_freeze_state = FREEZE_STATE_WAKE;
+ wake_up(&suspend_freeze_wait_head);
+ }
+ spin_unlock_irqrestore(&suspend_freeze_lock, flags);
}
EXPORT_SYMBOL_GPL(freeze_wake);
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index cbd69d842341..2ca4a8b5fe57 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -3,7 +3,7 @@
struct console_cmdline
{
- char name[8]; /* Name of the driver */
+ char name[16]; /* Name of the driver */
int index; /* Minor dev. to use */
char *options; /* Options for the driver */
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02d6b6d28796..bb0635bd74f2 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str)
early_param("ignore_loglevel", ignore_loglevel_setup);
module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
- "print all kernel messages to the console.");
+MODULE_PARM_DESC(ignore_loglevel,
+ "ignore loglevel setting (prints all kernel messages to the console)");
#ifdef CONFIG_BOOT_PRINTK_DELAY
@@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len)
}
/*
- * Zap console related locks when oopsing. Only zap at most once
- * every 10 seconds, to leave time for slow consoles to print a
- * full oops.
+ * Zap console related locks when oopsing.
+ * To leave time for slow consoles to print a full oops,
+ * only zap at most once every 30 seconds.
*/
static void zap_locks(void)
{
static unsigned long oops_timestamp;
if (time_after_eq(jiffies, oops_timestamp) &&
- !time_after(jiffies, oops_timestamp + 30 * HZ))
+ !time_after(jiffies, oops_timestamp + 30 * HZ))
return;
oops_timestamp = jiffies;
@@ -1811,7 +1811,7 @@ int vprintk_default(const char *fmt, va_list args)
#ifdef CONFIG_KGDB_KDB
if (unlikely(kdb_trap_printk)) {
- r = vkdb_printf(fmt, args);
+ r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
return r;
}
#endif
@@ -2464,6 +2464,7 @@ void register_console(struct console *newcon)
for (i = 0, c = console_cmdline;
i < MAX_CMDLINECONSOLES && c->name[0];
i++, c++) {
+ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
if (strcmp(c->name, newcon->name) != 0)
continue;
if (newcon->index >= 0 &&
diff --git a/kernel/profile.c b/kernel/profile.c
index 54bf5ba26420..a7bcd28d6e9f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -422,8 +422,7 @@ void profile_tick(int type)
static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
- seq_cpumask(m, prof_cpu_mask);
- seq_putc(m, '\n');
+ seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
return 0;
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1eb9d90c3af9..227fec36b12a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
}
#if defined CONFIG_COMPAT
-#include <linux/compat.h>
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
compat_ulong_t addr, compat_ulong_t data)
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index e6fae503d1bc..50a808424b06 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,5 @@
-obj-y += update.o srcu.o
+obj-y += update.o
+obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 07bb02eda844..80adef7d4c3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void);
void rcu_early_boot_tests(void);
+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d559baf06e0..30d42aa55d83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -244,7 +244,8 @@ struct rcu_torture_ops {
int (*readlock)(void);
void (*read_delay)(struct torture_random_state *rrsp);
void (*readunlock)(int idx);
- int (*completed)(void);
+ unsigned long (*started)(void);
+ unsigned long (*completed)(void);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
@@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU)
rcu_read_unlock();
}
-static int rcu_torture_completed(void)
-{
- return rcu_batches_completed();
-}
-
/*
* Update callback in the pipe. This should be invoked after a grace period.
*/
@@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p)
cur_ops->deferred_free(rp);
}
-static int rcu_no_completed(void)
+static unsigned long rcu_no_completed(void)
{
return 0;
}
@@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
- .completed = rcu_torture_completed,
+ .started = rcu_batches_started,
+ .completed = rcu_batches_completed,
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
@@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
rcu_read_unlock_bh();
}
-static int rcu_bh_torture_completed(void)
-{
- return rcu_batches_completed_bh();
-}
-
static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
{
call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
@@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = {
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
- .completed = rcu_bh_torture_completed,
+ .started = rcu_batches_started_bh,
+ .completed = rcu_batches_completed_bh,
.deferred_free = rcu_bh_torture_deferred_free,
.sync = synchronize_rcu_bh,
.exp_sync = synchronize_rcu_bh_expedited,
@@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
+ .started = rcu_no_completed,
.completed = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
@@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
srcu_read_unlock(&srcu_ctl, idx);
}
-static int srcu_torture_completed(void)
+static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(&srcu_ctl);
}
@@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .started = NULL,
.completed = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = {
.readlock = sched_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = sched_torture_read_unlock,
- .completed = rcu_no_completed,
+ .started = rcu_batches_started_sched,
+ .completed = rcu_batches_completed_sched,
.deferred_free = rcu_sched_torture_deferred_free,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
@@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = {
.readlock = tasks_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = tasks_torture_read_unlock,
+ .started = rcu_no_completed,
.completed = rcu_no_completed,
.deferred_free = rcu_tasks_torture_deferred_free,
.sync = synchronize_rcu_tasks,
@@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void)
static void rcu_torture_timer(unsigned long unused)
{
int idx;
- int completed;
- int completed_end;
+ unsigned long started;
+ unsigned long completed;
static DEFINE_TORTURE_RANDOM(rand);
static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
@@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused)
unsigned long long ts;
idx = cur_ops->readlock();
- completed = cur_ops->completed();
+ if (cur_ops->started)
+ started = cur_ops->started();
+ else
+ started = cur_ops->completed();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
@@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- completed_end = cur_ops->completed();
+ completed = cur_ops->completed();
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
- completed, completed_end);
+ started, completed);
rcutorture_trace_dump();
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = completed_end - completed;
+ completed = completed - started;
+ if (cur_ops->started)
+ completed++;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused)
static int
rcu_torture_reader(void *arg)
{
- int completed;
- int completed_end;
+ unsigned long started;
+ unsigned long completed;
int idx;
DEFINE_TORTURE_RANDOM(rand);
struct rcu_torture *p;
@@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg)
mod_timer(&t, jiffies + 1);
}
idx = cur_ops->readlock();
- completed = cur_ops->completed();
+ if (cur_ops->started)
+ started = cur_ops->started();
+ else
+ started = cur_ops->completed();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
@@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- completed_end = cur_ops->completed();
+ completed = cur_ops->completed();
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
- ts, completed, completed_end);
+ ts, started, completed);
rcutorture_trace_dump();
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = completed_end - completed;
+ completed = completed - started;
+ if (cur_ops->started)
+ completed++;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg)
cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
n_rcu_torture_barrier_error++;
+ pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
+ atomic_read(&barrier_cbs_invoked),
+ n_barrier_cbs);
WARN_ON_ONCE(1);
}
n_barrier_successes++;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e037f3eb2f7b..445bf8ffe3fb 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
* Report the number of batches, correlated with, but not necessarily
* precisely the same as, the number of grace periods that have elapsed.
*/
-long srcu_batches_completed(struct srcu_struct *sp)
+unsigned long srcu_batches_completed(struct srcu_struct *sp)
{
return sp->completed;
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 0db5649f8817..cc9ceca7bde1 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
-static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-
#include "tiny_plugin.h"
-/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
-static void rcu_idle_enter_common(long long newval)
-{
- if (newval) {
- RCU_TRACE(trace_rcu_dyntick(TPS("--="),
- rcu_dynticks_nesting, newval));
- rcu_dynticks_nesting = newval;
- return;
- }
- RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
- rcu_dynticks_nesting, newval));
- if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
- RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
- rcu_dynticks_nesting, newval));
- ftrace_dump(DUMP_ALL);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
- rcu_sched_qs(); /* implies rcu_bh_inc() */
- barrier();
- rcu_dynticks_nesting = newval;
-}
-
/*
* Enter idle, which is an extended quiescent state if we have fully
- * entered that mode (i.e., if the new value of dynticks_nesting is zero).
+ * entered that mode.
*/
void rcu_idle_enter(void)
{
- unsigned long flags;
- long long newval;
-
- local_irq_save(flags);
- WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
- if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
- DYNTICK_TASK_NEST_VALUE)
- newval = 0;
- else
- newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
- rcu_idle_enter_common(newval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
*/
void rcu_irq_exit(void)
{
- unsigned long flags;
- long long newval;
-
- local_irq_save(flags);
- newval = rcu_dynticks_nesting - 1;
- WARN_ON_ONCE(newval < 0);
- rcu_idle_enter_common(newval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_irq_exit);
-/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
-static void rcu_idle_exit_common(long long oldval)
-{
- if (oldval) {
- RCU_TRACE(trace_rcu_dyntick(TPS("++="),
- oldval, rcu_dynticks_nesting));
- return;
- }
- RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
- if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
-
- RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
- oldval, rcu_dynticks_nesting));
- ftrace_dump(DUMP_ALL);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
-}
-
/*
* Exit idle, so that we are no longer in an extended quiescent state.
*/
void rcu_idle_exit(void)
{
- unsigned long flags;
- long long oldval;
-
- local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- WARN_ON_ONCE(rcu_dynticks_nesting < 0);
- if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
- rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
- else
- rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_idle_exit_common(oldval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
*/
void rcu_irq_enter(void)
{
- unsigned long flags;
- long long oldval;
-
- local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- rcu_dynticks_nesting++;
- WARN_ON_ONCE(rcu_dynticks_nesting == 0);
- rcu_idle_exit_common(oldval);
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_irq_enter);
@@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter);
*/
bool notrace __rcu_is_watching(void)
{
- return rcu_dynticks_nesting;
+ return true;
}
EXPORT_SYMBOL(__rcu_is_watching);
#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
/*
- * Test whether the current CPU was interrupted from idle. Nested
- * interrupts don't count, we must be running at the first interrupt
- * level.
- */
-static int rcu_is_cpu_rrupt_from_idle(void)
-{
- return rcu_dynticks_nesting <= 1;
-}
-
-/*
* Helper function for rcu_sched_qs() and rcu_bh_qs().
* Also irqs are disabled to avoid confusion due to interrupt handlers
* invoking call_rcu().
@@ -250,7 +150,7 @@ void rcu_bh_qs(void)
void rcu_check_callbacks(int user)
{
RCU_TRACE(check_cpu_stalls());
- if (user || rcu_is_cpu_rrupt_from_idle())
+ if (user)
rcu_sched_qs();
else if (!in_softirq())
rcu_bh_qs();
@@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head,
rcp->curtail = &head->next;
RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
+
+ if (unlikely(is_idle_task(current))) {
+ /* force scheduling for rcu_sched_qs() */
+ resched_cpu(0);
+ }
}
/*
@@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 858c56569127..f94e209a10d6 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
rcp->ticks_this_gp++;
j = jiffies;
js = ACCESS_ONCE(rcp->jiffies_stall);
- if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+ if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
- rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+ rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
jiffies - rcp->gp_start, rcp->qlen);
dump_stack();
- }
- if (*rcp->curtail && ULONG_CMP_GE(j, js))
ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
- else if (ULONG_CMP_GE(j, js))
+ } else if (ULONG_CMP_GE(j, js)) {
ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ }
}
static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc275036..48d640ca1a05 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
/*
* Track the rcutorture test sequence number and the update version
* number within a given test. The rcutorture_testseq is incremented
@@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};
+DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+
/*
* Let the RCU core know that this CPU has gone through the scheduler,
* which is a quiescent state. This is called when the need for a
@@ -284,6 +291,22 @@ void rcu_note_context_switch(void)
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
+/*
+ * Register a quiescent state for all RCU flavors. If there is an
+ * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * dyntick-idle quiescent state visible to other CPUs (but only for those
+ * RCU flavors in desperate need of a quiescent state, which will normally
+ * be none of them). Either way, do a lightweight quiescent state for
+ * all RCU flavors.
+ */
+void rcu_all_qs(void)
+{
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
+ this_cpu_inc(rcu_qs_ctr);
+}
+EXPORT_SYMBOL_GPL(rcu_all_qs);
+
static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000; /* If this many pending, ignore blimit. */
static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);
/*
- * Return the number of RCU-sched batches processed thus far for debug & stats.
+ * Return the number of RCU batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started(void)
+{
+ return rcu_state_p->gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started);
+
+/*
+ * Return the number of RCU-sched batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started_sched(void)
+{
+ return rcu_sched_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
+
+/*
+ * Return the number of RCU BH batches started thus far for debug & stats.
*/
-long rcu_batches_completed_sched(void)
+unsigned long rcu_batches_started_bh(void)
+{
+ return rcu_bh_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
+
+/*
+ * Return the number of RCU batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed(void)
+{
+ return rcu_state_p->completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU-sched batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed_sched(void)
{
return rcu_sched_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
/*
- * Return the number of RCU BH batches processed thus far for debug & stats.
+ * Return the number of RCU BH batches completed thus far for debug & stats.
*/
-long rcu_batches_completed_bh(void)
+unsigned long rcu_batches_completed_bh(void)
{
return rcu_bh_state.completed;
}
@@ -759,39 +818,71 @@ void rcu_irq_enter(void)
/**
* rcu_nmi_enter - inform RCU of entry to NMI context
*
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is active.
+ * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
+ * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
*/
void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int incby = 2;
- if (rdtp->dynticks_nmi_nesting == 0 &&
- (atomic_read(&rdtp->dynticks) & 0x1))
- return;
- rdtp->dynticks_nmi_nesting++;
- smp_mb__before_atomic(); /* Force delay from prior write. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ /* Complain about underflow. */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment ->dynticks
+ * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+ * Otherwise, increment ->dynticks_nmi_nesting by two. This means
+ * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+ smp_mb__before_atomic(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* atomic_inc() before later RCU read-side crit sects */
+ smp_mb__after_atomic(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ incby = 1;
+ }
+ rdtp->dynticks_nmi_nesting += incby;
+ barrier();
}
/**
* rcu_nmi_exit - inform RCU of exit from NMI context
*
- * If the CPU was idle with dynamic ticks active, and there is no
- * irq handler running, this updates rdtp->dynticks_nmi to let the
- * RCU grace-period handling know that the CPU is no longer active.
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
*/
void rcu_nmi_exit(void)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- if (rdtp->dynticks_nmi_nesting == 0 ||
- --rdtp->dynticks_nmi_nesting != 0)
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (rdtp->dynticks_nmi_nesting != 1) {
+ rdtp->dynticks_nmi_nesting -= 2;
return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ rdtp->dynticks_nmi_nesting = 0;
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
@@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
return 1;
} else {
+ if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+ rdp->mynode->gpnum))
+ ACCESS_ONCE(rdp->gpwrap) = true;
return 0;
}
}
/*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
-/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
j1 = rcu_jiffies_till_stall_check();
ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
rsp->jiffies_resched = j + j1 / 2;
+ rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+}
+
+/*
+ * Complain about starvation of grace-period kthread.
+ */
+static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
+{
+ unsigned long gpa;
+ unsigned long j;
+
+ j = jiffies;
+ gpa = ACCESS_ONCE(rsp->gp_activity);
+ if (j - gpa > 2 * HZ)
+ pr_err("%s kthread starved for %ld jiffies!\n",
+ rsp->name, j - gpa);
}
/*
@@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
}
}
-static void print_other_cpu_stall(struct rcu_state *rsp)
+static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
long delta;
unsigned long flags;
+ unsigned long gpa;
+ unsigned long j;
int ndetected = 0;
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
@@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
- /*
- * Now rat on any tasks that got kicked up to the root rcu_node
- * due to CPU offlining.
- */
- rnp = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- ndetected += rcu_print_task_stall(rnp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
(long)rsp->gpnum, (long)rsp->completed, totqlen);
- if (ndetected == 0)
- pr_err("INFO: Stall ended before state dump start\n");
- else
+ if (ndetected) {
rcu_dump_cpu_stacks(rsp);
+ } else {
+ if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
+ ACCESS_ONCE(rsp->completed) == gpnum) {
+ pr_err("INFO: Stall ended before state dump start\n");
+ } else {
+ j = jiffies;
+ gpa = ACCESS_ONCE(rsp->gp_activity);
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+ rsp->name, j - gpa, j, gpa,
+ jiffies_till_next_fqs);
+ /* In this case, the current CPU might be at fault. */
+ sched_show_task(current);
+ }
+ }
/* Complain about tasks blocking the grace period. */
-
rcu_print_detail_task_stall(rsp);
+ rcu_check_gp_kthread_starvation(rsp);
+
force_quiescent_state(rsp); /* Kick them all. */
}
@@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
jiffies - rsp->gp_start,
(long)rsp->gpnum, (long)rsp->completed, totqlen);
+
+ rcu_check_gp_kthread_starvation(rsp);
+
rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
/* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(rsp);
+ print_other_cpu_stall(rsp, gpnum);
}
}
@@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
bool ret;
/* Handle the ends of any preceding grace periods first. */
- if (rdp->completed == rnp->completed) {
+ if (rdp->completed == rnp->completed &&
+ !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
/* No grace period end, so just accelerate recent callbacks. */
ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
}
- if (rdp->gpnum != rnp->gpnum) {
+ if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
rdp->passed_quiesce = 0;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
+ ACCESS_ONCE(rdp->gpwrap) = false;
}
return ret;
}
@@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
local_irq_save(flags);
rnp = rdp->mynode;
if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
- rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+ rdp->completed == ACCESS_ONCE(rnp->completed) &&
+ !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
!raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
@@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
@@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
}
mutex_unlock(&rsp->onoff_mutex);
@@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
@@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
gp_duration = jiffies - rsp->gp_start;
@@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
@@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
if (rcu_gp_init(rsp))
break;
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
@@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
ACCESS_ONCE(rsp->gpnum),
TPS("fqsend"));
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
} else {
/* Deal with stray signal. */
cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
@@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
- rnp->completed == rnp->gpnum) {
+ if ((rdp->passed_quiesce == 0 &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
+ rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
+ rdp->gpwrap) {
/*
* The grace period in which this quiescent state was
@@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* within the current grace period.
*/
rdp->passed_quiesce = 0; /* need qs for new gp. */
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
@@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (!rdp->passed_quiesce)
+ if (!rdp->passed_quiesce &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
return;
/*
@@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
}
/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock(); /* GP memory ordering. */
+ rnp->qsmaskinit &= ~mask;
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ }
+}
+
+/*
* The CPU has been completely removed, and some other CPU is reporting
* this fact from process context. Do the remainder of the cleanup,
* including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
- unsigned long mask;
- int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
rcu_adopt_orphan_cbs(rsp, flags);
+ raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
- /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
- mask = rdp->grpmask; /* rnp->grplo is constant. */
- do {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- rnp->qsmaskinit &= ~mask;
- if (rnp->qsmaskinit != 0) {
- if (rnp != rdp->mynode)
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- break;
- }
- if (rnp == rdp->mynode)
- need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
- else
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- mask = rnp->grpmask;
- rnp = rnp->parent;
- } while (rnp != NULL);
-
- /*
- * We still hold the leaf rcu_node structure lock here, and
- * irqs are still disabled. The reason for this subterfuge is
- * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
- * held leads to deadlock.
- */
- raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
- rnp = rdp->mynode;
- if (need_report & RCU_OFL_TASKS_NORM_GP)
- rcu_report_unblock_qs_rnp(rnp, flags);
- else
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (need_report & RCU_OFL_TASKS_EXP_GP)
- rcu_report_exp_rnp(rsp, rnp, true);
+ /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+ rnp->qsmaskinit &= ~rdp->grpmask;
+ if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
+ rcu_cleanup_dead_rnp(rnp);
+ rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
cpu, rdp->qlen, rdp->nxtlist);
@@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
}
+static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+}
+
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
}
@@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
- rnp = rcu_get_root(rsp);
- if (rnp->qsmask == 0) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
- }
}
/*
@@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
* Schedule RCU callback invocation. If the specified type of RCU
* does not support RCU priority boosting, just do a direct call,
* otherwise wake up the per-CPU kernel kthread. Note that because we
- * are running on the current CPU with interrupts disabled, the
+ * are running on the current CPU with softirqs disabled, the
* rcu_cpu_kthread_task cannot disappear out from under us.
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
- rdp->qs_pending && !rdp->passed_quiesce) {
+ rdp->qs_pending && !rdp->passed_quiesce &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
rdp->n_rp_qs_pending++;
- } else if (rdp->qs_pending && rdp->passed_quiesce) {
+ } else if (rdp->qs_pending &&
+ (rdp->passed_quiesce ||
+ rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
rdp->n_rp_report_qs++;
return 1;
}
@@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+ if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+ unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
rdp->n_rp_gp_started++;
return 1;
}
@@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
} else {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
rsp->n_barrier_done);
+ smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
__call_rcu(&rdp->barrier_head,
rcu_barrier_callback, rsp, cpu, 0);
@@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- init_callback_list(rdp);
- rdp->qlen_lazy = 0;
- ACCESS_ONCE(rdp->qlen) = 0;
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->gpnum = rnp->completed;
rdp->completed = rnp->completed;
rdp->passed_quiesce = 0;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = 0;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
}
@@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self,
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
+ int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
struct rcu_state *rsp;
+ struct sched_param sp;
struct task_struct *t;
+ /* Force priority into range. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
+ kthread_prio, kthread_prio_in);
+
rcu_scheduler_fully_active = 1;
for_each_rcu_flavor(rsp) {
- t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
+ t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
BUG_ON(IS_ERR(t));
rnp = rcu_get_root(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
rsp->gp_kthread = t;
+ if (kthread_prio) {
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ wake_up_process(t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
rcu_spawn_nocb_kthreads();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e7b1843896e..119de399eb2f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,7 +27,6 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
-#include <linux/irq_work.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -172,11 +171,6 @@ struct rcu_node {
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
- struct completion boost_completion;
- /* Used to ensure that the rt_mutex used */
- /* to carry out the boosting is fully */
- /* released with no future boostee accesses */
- /* before that rt_mutex is re-initialized. */
struct rt_mutex boost_mtx;
/* Used only for the priority-boosting */
/* side effect, not as a lock. */
@@ -257,9 +251,12 @@ struct rcu_data {
/* in order to detect GP end. */
unsigned long gpnum; /* Highest gp number that this CPU */
/* is aware of having started. */
+ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
+ /* for rcu_all_qs() invocations. */
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
+ bool gpwrap; /* Possible gpnum/completed wrap. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -340,14 +337,10 @@ struct rcu_data {
#ifdef CONFIG_RCU_NOCB_CPU
struct rcu_head *nocb_head; /* CBs waiting for kthread. */
struct rcu_head **nocb_tail;
- atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
- atomic_long_t nocb_q_count_lazy; /* (approximate). */
+ atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
+ atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
struct rcu_head **nocb_follower_tail;
- atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
- atomic_long_t nocb_follower_count_lazy; /* (approximate). */
- int nocb_p_count; /* # CBs being invoked by kthread */
- int nocb_p_count_lazy; /* (approximate). */
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -356,8 +349,6 @@ struct rcu_data {
struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
/* CBs waiting for GP. */
struct rcu_head **nocb_gp_tail;
- long nocb_gp_count;
- long nocb_gp_count_lazy;
bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
struct rcu_data *nocb_next_follower;
/* Next follower in wakeup chain. */
@@ -488,10 +479,14 @@ struct rcu_state {
/* due to no GP active. */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
+ unsigned long gp_activity; /* Time of last GP kthread */
+ /* activity in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
unsigned long jiffies_resched; /* Time at which to resched */
/* a reluctant CPU. */
+ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
+ /* GP start. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
const char *name; /* Name of structure. */
@@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors;
#define for_each_rcu_flavor(rsp) \
list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
-/* Return values for rcu_preempt_offline_tasks(). */
-
-#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
- /* GP were moved to root. */
-#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
- /* GP were moved to root. */
-
/*
* RCU implementation internal declarations:
*/
@@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
-long rcu_batches_completed(void);
static void rcu_preempt_note_context_switch(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
- unsigned long flags);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-#ifdef CONFIG_HOTPLUG_CPU
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_preempt_check_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void);
#endif /* #ifndef RCU_TREE_NONCORE */
#ifdef CONFIG_RCU_TRACE
-#ifdef CONFIG_RCU_NOCB_CPU
-/* Sum up queue lengths for tracing. */
+/* Read out queue lengths for tracing. */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
{
- *ql = atomic_long_read(&rdp->nocb_q_count) +
- rdp->nocb_p_count +
- atomic_long_read(&rdp->nocb_follower_count) +
- rdp->nocb_p_count + rdp->nocb_gp_count;
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
- rdp->nocb_p_count_lazy +
- atomic_long_read(&rdp->nocb_follower_count_lazy) +
- rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
-}
+#ifdef CONFIG_RCU_NOCB_CPU
+ *ql = atomic_long_read(&rdp->nocb_q_count);
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
*ql = 0;
*qll = 0;
-}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+}
#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3ec85cb5d544..0a571e9a0f1d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -34,10 +34,6 @@
#include "../locking/rtmutex_common.h"
-/* rcuc/rcub kthread realtime priority */
-static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
-module_param(kthread_prio, int, 0644);
-
/*
* Control variables for per-CPU and per-rcu_node kthreads. These
* handle all flavors of RCU.
@@ -53,7 +49,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-static char __initdata nocb_buf[NR_CPUS * 5];
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/*
@@ -103,6 +98,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state_p = &rcu_preempt_state;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake);
/*
* Tell them what RCU they are running.
@@ -114,25 +111,6 @@ static void __init rcu_bootup_announce(void)
}
/*
- * Return the number of RCU-preempt batches processed thus far
- * for debug and statistics.
- */
-static long rcu_batches_completed_preempt(void)
-{
- return rcu_preempt_state.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
-
-/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
- return rcu_batches_completed_preempt();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
@@ -307,15 +285,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
}
/*
+ * Return true if the specified rcu_node structure has tasks that were
+ * preempted within an RCU read-side critical section.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+ return !list_empty(&rnp->blkd_tasks);
+}
+
+/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
*/
void rcu_read_unlock_special(struct task_struct *t)
{
- int empty;
- int empty_exp;
- int empty_exp_now;
+ bool empty;
+ bool empty_exp;
+ bool empty_norm;
+ bool empty_exp_now;
unsigned long flags;
struct list_head *np;
#ifdef CONFIG_RCU_BOOST
@@ -338,6 +326,7 @@ void rcu_read_unlock_special(struct task_struct *t)
special = t->rcu_read_unlock_special;
if (special.b.need_qs) {
rcu_preempt_qs();
+ t->rcu_read_unlock_special.b.need_qs = false;
if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
return;
@@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t)
break;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- empty = !rcu_preempt_blocked_readers_cgp(rnp);
+ empty = !rcu_preempt_has_tasks(rnp);
+ empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = !rcu_preempted_readers_exp(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
@@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t)
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
+ * If this was the last task on the list, go see if we
+ * need to propagate ->qsmaskinit bit clearing up the
+ * rcu_node tree.
+ */
+ if (!empty && !rcu_preempt_has_tasks(rnp))
+ rcu_cleanup_dead_rnp(rnp);
+
+ /*
* If this was the last task on the current list, and if
* we aren't waiting on any CPUs, report the quiescent state.
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
* so we must take a snapshot of the expedited state.
*/
empty_exp_now = !rcu_preempted_readers_exp(rnp);
- if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
+ if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gpnum,
0, rnp->qsmask,
@@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (drop_boost_mutex) {
+ if (drop_boost_mutex)
rt_mutex_unlock(&rnp->boost_mtx);
- complete(&rnp->boost_completion);
- }
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (!list_empty(&rnp->blkd_tasks))
+ if (rcu_preempt_has_tasks(rnp))
rnp->gp_tasks = rnp->blkd_tasks.next;
WARN_ON_ONCE(rnp->qsmask);
}
#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Handle tasklist migration for case in which all CPUs covered by the
- * specified rcu_node have gone offline. Move them up to the root
- * rcu_node. The reason for not just moving them to the immediate
- * parent is to remove the need for rcu_read_unlock_special() to
- * make more than two attempts to acquire the target rcu_node's lock.
- * Returns true if there were tasks blocking the current RCU grace
- * period.
- *
- * Returns 1 if there was previously a task blocking the current grace
- * period on the specified rcu_node structure.
- *
- * The caller must hold rnp->lock with irqs disabled.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- struct list_head *lp;
- struct list_head *lp_root;
- int retval = 0;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
- struct task_struct *t;
-
- if (rnp == rnp_root) {
- WARN_ONCE(1, "Last CPU thought to be offlined?");
- return 0; /* Shouldn't happen: at least one CPU online. */
- }
-
- /* If we are on an internal node, complain bitterly. */
- WARN_ON_ONCE(rnp != rdp->mynode);
-
- /*
- * Move tasks up to root rcu_node. Don't try to get fancy for
- * this corner-case operation -- just put this node's tasks
- * at the head of the root node's list, and update the root node's
- * ->gp_tasks and ->exp_tasks pointers to those of this node's,
- * if non-NULL. This might result in waiting for more tasks than
- * absolutely necessary, but this is a good performance/complexity
- * tradeoff.
- */
- if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
- retval |= RCU_OFL_TASKS_NORM_GP;
- if (rcu_preempted_readers_exp(rnp))
- retval |= RCU_OFL_TASKS_EXP_GP;
- lp = &rnp->blkd_tasks;
- lp_root = &rnp_root->blkd_tasks;
- while (!list_empty(lp)) {
- t = list_entry(lp->next, typeof(*t), rcu_node_entry);
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
- list_del(&t->rcu_node_entry);
- t->rcu_blocked_node = rnp_root;
- list_add(&t->rcu_node_entry, lp_root);
- if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp_root->gp_tasks = rnp->gp_tasks;
- if (&t->rcu_node_entry == rnp->exp_tasks)
- rnp_root->exp_tasks = rnp->exp_tasks;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp_root->boost_tasks = rnp->boost_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
- raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
- }
-
- rnp->gp_tasks = NULL;
- rnp->exp_tasks = NULL;
-#ifdef CONFIG_RCU_BOOST
- rnp->boost_tasks = NULL;
- /*
- * In case root is being boosted and leaf was not. Make sure
- * that we boost the tasks blocking the current grace period
- * in this case.
- */
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
- if (rnp_root->boost_tasks != NULL &&
- rnp_root->boost_tasks != rnp_root->gp_tasks &&
- rnp_root->boost_tasks != rnp_root->exp_tasks)
- rnp_root->boost_tasks = rnp_root->gp_tasks;
- raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
- return retval;
-}
-
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
@@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (list_empty(&rnp->blkd_tasks)) {
+ if (!rcu_preempt_has_tasks(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
} else {
rnp->exp_tasks = rnp->blkd_tasks.next;
@@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void)
}
/*
- * Return the number of RCU batches processed thus far for debug & stats.
- */
-long rcu_batches_completed(void)
-{
- return rcu_batches_completed_sched();
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-
-/*
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
@@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
#ifdef CONFIG_HOTPLUG_CPU
-/* Because preemptible RCU does not exist, no quieting of tasks. */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
- __releases(rnp->lock)
+/*
+ * Because there is no preemptible RCU, there can be no readers blocked.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return false;
}
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
WARN_ON_ONCE(rnp->qsmask);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections, and
- * such non-existent tasks cannot possibly have been blocking the current
- * grace period.
- */
-static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
- struct rcu_node *rnp,
- struct rcu_data *rdp)
-{
- return 0;
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
@@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Because preemptible RCU does not exist, there is never any need to
- * report on tasks preempted in RCU read-side critical sections during
- * expedited RCU grace periods.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
@@ -1080,7 +951,7 @@ void exit_rcu(void)
static void rcu_initiate_boost_trace(struct rcu_node *rnp)
{
- if (list_empty(&rnp->blkd_tasks))
+ if (!rcu_preempt_has_tasks(rnp))
rnp->n_balk_blkd_tasks++;
else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
rnp->n_balk_exp_gp_tasks++;
@@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp)
struct task_struct *t;
struct list_head *tb;
- if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
+ if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
+ ACCESS_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp)
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
- init_completion(&rnp->boost_completion);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
- /* Wait for boostee to be done w/boost_mtx before reinitializing. */
- wait_for_completion(&rnp->boost_completion);
-
return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
ACCESS_ONCE(rnp->boost_tasks) != NULL;
}
@@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
if ((mask & 0x1) && cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0) {
+ if (cpumask_weight(cm) == 0)
cpumask_setall(cm);
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
- cpumask_clear_cpu(cpu, cm);
- WARN_ON_ONCE(cpumask_weight(cm) == 0);
- }
set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
@@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
- rnp = rcu_get_root(rcu_state_p);
- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
- if (NUM_RCU_NODES > 1) {
- rcu_for_each_leaf_node(rcu_state_p, rnp)
- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
- }
+ rcu_for_each_leaf_node(rcu_state_p, rnp)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
static void rcu_prepare_kthreads(int cpu)
@@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* completed since we last checked and there are
* callbacks not yet ready to invoke.
*/
- if (rdp->completed != rnp->completed &&
+ if ((rdp->completed != rnp->completed ||
+ unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
note_gp_changes(rsp, rdp);
@@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
+ pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu, ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
fast_no_hz);
}
@@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ unsigned long ret;
+#ifdef CONFIG_PROVE_RCU
struct rcu_head *rhp;
+#endif /* #ifdef CONFIG_PROVE_RCU */
- /* No-CBs CPUs might have callbacks on any of three lists. */
+ /*
+ * Check count of all no-CBs callbacks awaiting invocation.
+ * There needs to be a barrier before this function is called,
+ * but associated with a prior determination that no more
+ * callbacks would be posted. In the worst case, the first
+ * barrier in _rcu_barrier() suffices (but the caller cannot
+ * necessarily rely on this, not a substitute for the caller
+ * getting the concurrency design right!). There must also be
+ * a barrier between the following load and posting of a callback
+ * (if a callback is in fact needed). This is associated with an
+ * atomic_inc() in the caller.
+ */
+ ret = atomic_long_read(&rdp->nocb_q_count);
+
+#ifdef CONFIG_PROVE_RCU
rhp = ACCESS_ONCE(rdp->nocb_head);
if (!rhp)
rhp = ACCESS_ONCE(rdp->nocb_gp_head);
@@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
cpu, rhp->func);
WARN_ON_ONCE(1);
}
+#endif /* #ifdef CONFIG_PROVE_RCU */
- return !!rhp;
+ return !!ret;
}
/*
@@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
struct task_struct *t;
/* Enqueue the callback on the nocb list and update counts. */
+ atomic_long_add(rhcount, &rdp->nocb_q_count);
+ /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
old_rhpp = xchg(&rdp->nocb_tail, rhtp);
ACCESS_ONCE(*old_rhpp) = rhp;
- atomic_long_add(rhcount, &rdp->nocb_q_count);
atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
@@ -2288,9 +2169,6 @@ wait_again:
/* Move callbacks to wait-for-GP list, which is empty. */
ACCESS_ONCE(rdp->nocb_head) = NULL;
rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
- rdp->nocb_gp_count_lazy =
- atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
gotcbs = true;
}
@@ -2338,9 +2216,6 @@ wait_again:
/* Append callbacks to follower's "done" list. */
tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
*tail = rdp->nocb_gp_head;
- atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
- atomic_long_add(rdp->nocb_gp_count_lazy,
- &rdp->nocb_follower_count_lazy);
smp_mb__after_atomic(); /* Store *tail before wakeup. */
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
/*
@@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg)
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
- c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
- cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
- rdp->nocb_p_count += c;
- rdp->nocb_p_count_lazy += cl;
/* Each pass through the following loop invokes a callback. */
- trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
+ trace_rcu_batch_start(rdp->rsp->name,
+ atomic_long_read(&rdp->nocb_q_count_lazy),
+ atomic_long_read(&rdp->nocb_q_count), -1);
c = cl = 0;
while (list) {
next = list->next;
@@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg)
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
- ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
- ACCESS_ONCE(rdp->nocb_p_count_lazy) =
- rdp->nocb_p_count_lazy - cl;
+ smp_mb__before_atomic(); /* _add after CB invocation. */
+ atomic_long_add(-c, &rdp->nocb_q_count);
+ atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
rdp->n_nocbs_invoked += c;
}
return 0;
@@ -2513,8 +2386,8 @@ void __init rcu_init_nohz(void)
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
rcu_nocb_mask);
}
- cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
- pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
if (rcu_nocb_poll)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 5cdc62e1beeb..fbb6240509ea 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -46,6 +46,8 @@
#define RCU_TREE_NONCORE
#include "tree.h"
+DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
{
@@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
if (!rdp->beenonline)
return;
- seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
+ seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
ulong2long(rdp->completed), ulong2long(rdp->gpnum),
- rdp->passed_quiesce, rdp->qs_pending);
+ rdp->passed_quiesce,
+ rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
+ rdp->qs_pending);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
diff --git a/kernel/resource.c b/kernel/resource.c
index 0bcebffc4e77..19f2357dfda3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -22,6 +22,7 @@
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
+#include <linux/resource_ext.h>
#include <asm/io.h>
@@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr)
return err;
}
+struct resource_entry *resource_list_create_entry(struct resource *res,
+ size_t extra_size)
+{
+ struct resource_entry *entry;
+
+ entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
+ if (entry) {
+ INIT_LIST_HEAD(&entry->node);
+ entry->res = res ? res : &entry->__res;
+ }
+
+ return entry;
+}
+EXPORT_SYMBOL(resource_list_create_entry);
+
+void resource_list_free(struct list_head *head)
+{
+ struct resource_entry *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, head, node)
+ resource_list_destroy_entry(entry);
+}
+EXPORT_SYMBOL(resource_list_free);
+
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = -pg
+CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
* so we don't have to move tasks around upon policy change,
* or flail around trying to allocate bandwidth on the fly.
* A bandwidth exception in __sched_setscheduler() allows
- * the policy change to proceed. Thereafter, task_group()
- * returns &root_task_group, so zero bandwidth is required.
+ * the policy change to proceed.
*/
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
if (tg != &root_task_group)
return false;
- if (p->sched_class != &fair_sched_class)
- return false;
-
/*
* We can only assume the task group can't go away on us if
* autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
EXPORT_SYMBOL_GPL(cpu_clock);
EXPORT_SYMBOL_GPL(local_clock);
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+ return local_clock();
+}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 607f852b4d04..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
unsigned long flags;
int ret = 1;
+ /*
+ * x->wait.lock is needed only when x->done is non-zero
+ * (the non-blocking case), so we check x->done first
+ * without taking the lock and return early when the
+ * caller would otherwise have had to block.
+ */
+ if (!READ_ONCE(x->done))
+ return 0;
+
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
@@ -288,13 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
*/
bool completion_done(struct completion *x)
{
- unsigned long flags;
- int ret = 1;
+ if (!READ_ONCE(x->done))
+ return false;
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ *
+ * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+ * the loads of ->done and ->wait.lock such that we cannot observe
+ * the lock before complete() acquires it while observing the ->done
+ * after it's acquired the lock.
+ */
+ smp_rmb();
+ spin_unlock_wait(&x->wait.lock);
+ return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc00566e..2f7937ee9e3a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;
- if (rq->skip_clock_update > 0)
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_skip_update & RQCF_ACT_SKIP)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -304,65 +306,8 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- lockdep_assert_held(&p->pi_lock);
-
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-static void __task_rq_unlock(struct rq *rq)
- __releases(rq->lock)
-{
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- __releases(rq->lock)
- __releases(p->pi_lock)
-{
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
/*
* this_rq_lock - lock this runqueue and disable interrupts.
@@ -490,6 +435,11 @@ static __init void init_hrtick(void)
*/
void hrtick_start(struct rq *rq, u64 delay)
{
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense. Rely on vruntime for fairness.
+ */
+ delay = max_t(u64, delay, 10000LL);
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
@@ -743,6 +693,23 @@ static inline bool got_nohz_idle_kick(void)
bool sched_can_stop_tick(void)
{
/*
+ * FIFO realtime policy runs the highest priority task. Other runnable
+ * tasks are of a lower priority. The scheduler tick does nothing.
+ */
+ if (current->policy == SCHED_FIFO)
+ return true;
+
+ /*
+ * Round-robin realtime tasks time slice with other tasks at the same
+ * realtime priority. Is this task the only one at this priority?
+ */
+ if (current->policy == SCHED_RR) {
+ struct sched_rt_entity *rt_se = &current->rt;
+
+ return rt_se->run_list.prev == rt_se->run_list.next;
+ }
+
+ /*
* More than one running task need preemption.
* nr_running update is assumed to be visible
* after IPI is sent from wakers.
@@ -1046,7 +1013,14 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* this case, we can save a useless back to back clock update.
*/
if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
- rq->skip_clock_update = 1;
+ rq_clock_skip_update(rq, true);
+}
+
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&task_migration_notifier, n);
}
#ifdef CONFIG_SMP
@@ -1079,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ struct task_migration_notifier tmn;
+
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+ perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+ tmn.task = p;
+ tmn.from_cpu = task_cpu(p);
+ tmn.to_cpu = new_cpu;
+
+ atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
}
__set_task_cpu(p, new_cpu);
@@ -1814,6 +1796,10 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
/*
@@ -1832,6 +1818,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+ p->se.avg.decay_count = 0;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -1839,7 +1828,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
- hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ init_dl_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2049,6 +2038,9 @@ static inline int dl_bw_cpus(int i)
* allocated bandwidth to reflect the new situation.
*
* This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
*/
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
@@ -2748,6 +2740,10 @@ again:
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
*/
static void __sched __schedule(void)
{
@@ -2756,7 +2752,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
-need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -2776,6 +2771,8 @@ need_resched:
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2800,13 +2797,13 @@ need_resched:
switch_count = &prev->nvcsw;
}
- if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+ if (task_on_rq_queued(prev))
update_rq_clock(rq);
next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- rq->skip_clock_update = 0;
+ rq->clock_skip_update = 0;
if (likely(prev != next)) {
rq->nr_switches++;
@@ -2821,8 +2818,6 @@ need_resched:
post_schedule(rq);
sched_preempt_enable_no_resched();
- if (need_resched())
- goto need_resched;
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2842,7 +2837,9 @@ asmlinkage __visible void __sched schedule(void)
struct task_struct *tsk = current;
sched_submit_work(tsk);
- __schedule();
+ do {
+ __schedule();
+ } while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2877,6 +2874,21 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+static void __sched notrace preempt_schedule_common(void)
+{
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ __schedule();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
+}
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2904,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
if (likely(!preemptible()))
return;
- do {
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
- } while (need_resched());
+ preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
@@ -3067,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
} else {
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
p->sched_class = &fair_sched_class;
}
@@ -3251,15 +3255,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
- init_dl_task_timer(dl_se);
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
- dl_se->dl_yielded = 0;
+
+ /*
+ * Changing the parameters of a task is 'tricky' and we're not doing
+ * the correct thing -- also see task_dead_dl() and switched_from_dl().
+ *
+ * What we SHOULD do is delay the bandwidth release until the 0-lag
+ * point. This would include retaining the task_struct until that time
+ * and change dl_overflow() to not immediately decrement the current
+ * amount.
+ *
+ * Instead we retain the current runtime/deadline and let the new
+ * parameters take effect after the current reservation period lapses.
+ * This is safe (albeit pessimistic) because the 0-lag point is always
+ * before the current scheduling deadline.
+ *
+ * We can still have temporary overloads because we do not delay the
+ * change in bandwidth until that time; so admission control is
+ * not on the safe side. It does however guarantee tasks will never
+ * consume more than promised.
+ */
}
/*
@@ -3382,6 +3402,20 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+static bool dl_param_changed(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != attr->sched_flags)
+ return true;
+
+ return false;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user)
@@ -3510,7 +3544,7 @@ recheck:
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
- if (dl_policy(policy))
+ if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
p->sched_reset_on_fork = reset_on_fork;
@@ -4202,17 +4236,10 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-static void __cond_resched(void)
-{
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-}
-
int __sched _cond_resched(void)
{
if (should_resched()) {
- __cond_resched();
+ preempt_schedule_common();
return 1;
}
return 0;
@@ -4237,7 +4264,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
- __cond_resched();
+ preempt_schedule_common();
else
cpu_relax();
ret = 1;
@@ -4253,7 +4280,7 @@ int __sched __cond_resched_softirq(void)
if (should_resched()) {
local_bh_enable();
- __cond_resched();
+ preempt_schedule_common();
local_bh_disable();
return 1;
}
@@ -4368,36 +4395,29 @@ EXPORT_SYMBOL_GPL(yield_to);
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
-void __sched io_schedule(void)
-{
- struct rq *rq = raw_rq();
-
- delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
- schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = raw_rq();
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
long ret;
+ current->in_iowait = 1;
+ if (old_iowait)
+ blk_schedule_flush_plug(current);
+ else
+ blk_flush_plug(current);
+
delayacct_blkio_start();
+ rq = raw_rq();
atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->in_iowait = old_iowait;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
+
return ret;
}
+EXPORT_SYMBOL(io_schedule_timeout);
/**
* sys_sched_get_priority_max - return maximum RT priority.
@@ -4508,9 +4528,10 @@ void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned state;
+ unsigned long state = p->state;
- state = p->state ? __ffs(p->state) + 1 : 0;
+ if (state)
+ state = __ffs(state) + 1;
printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
@@ -4642,6 +4663,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
struct dl_bw *cur_dl_b;
unsigned long flags;
+ if (!cpumask_weight(cur))
+ return ret;
+
rcu_read_lock_sched();
cur_dl_b = dl_bw_of(cpumask_any(cur));
trial_cpus = cpumask_weight(trial);
@@ -4740,7 +4764,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class && p->sched_class->set_cpus_allowed)
+ if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
@@ -5331,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb,
static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- unsigned long flags;
- long cpu = (long)hcpu;
- struct dl_bw *dl_b;
-
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
- set_cpu_active(cpu, false);
-
- /* explicitly allow suspend */
- if (!(action & CPU_TASKS_FROZEN)) {
- bool overflow;
- int cpus;
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (overflow)
- return notifier_from_errno(-EBUSY);
- }
+ set_cpu_active((long)hcpu, false);
return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
}
-
- return NOTIFY_DONE;
}
static int __init migration_init(void)
@@ -5408,9 +5409,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
- char str[256];
- cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5423,7 +5422,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %s level %s\n", str, sd->name);
+ printk(KERN_CONT "span %*pbl level %s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5442,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- /*
- * Even though we initialize ->capacity to something semi-sane,
- * we leave capacity_orig unset. This allows us to detect if
- * domain iteration is still funny without causing /0 traps.
- */
- if (!group->sgc->capacity_orig) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
- break;
- }
-
if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
@@ -5468,9 +5457,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_cpus(group));
- cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-
- printk(KERN_CONT " %s", str);
+ printk(KERN_CONT " %*pbl",
+ cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
printk(KERN_CONT " (cpu_capacity = %d)",
group->sgc->capacity);
@@ -5826,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}
-/* cpus with isolated domains */
-static cpumask_var_t cpu_isolated_map;
-
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
@@ -5937,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -6248,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
*/
if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -7013,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
*/
case CPU_ONLINE:
- case CPU_DOWN_FAILED:
cpuset_update_active_cpus(true);
break;
default:
@@ -7025,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action) {
+ unsigned long flags;
+ long cpu = (long)hcpu;
+ struct dl_bw *dl_b;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
+ /* explicitly allow suspend */
+ if (!(action & CPU_TASKS_FROZEN)) {
+ bool overflow;
+ int cpus;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
+ }
cpuset_update_active_cpus(false);
break;
case CPU_DOWN_PREPARE_FROZEN:
@@ -7171,8 +7177,8 @@ void __init sched_init(void)
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
- init_rt_rq(&rq->rt, rq);
- init_dl_rq(&rq->dl, rq);
+ init_rt_rq(&rq->rt);
+ init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -7212,7 +7218,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -7250,6 +7256,11 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -7259,11 +7270,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
- /*
- * During early bootup we pretend to be a normal task:
- */
- current->sched_class = &fair_sched_class;
-
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
@@ -7292,13 +7298,12 @@ void __might_sleep(const char *file, int line, int preempt_offset)
* since we will exit with TASK_RUNNING make sure we enter with it,
* otherwise we will destroy state.
*/
- if (WARN_ONCE(current->state != TASK_RUNNING,
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
"do not call blocking ops when !TASK_RUNNING; "
"state=%lx set at [<%p>] %pS\n",
current->state,
(void *)current->task_state_change,
- (void *)current->task_state_change))
- __set_current_state(TASK_RUNNING);
+ (void *)current->task_state_change);
___might_sleep(file, line, preempt_offset);
}
@@ -7325,6 +7330,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
in_atomic(), irqs_disabled(),
current->pid, current->comm);
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
@@ -7588,6 +7596,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
for_each_process_thread(g, p) {
if (rt_task(p) && task_group(p) == tg)
return 1;
@@ -7680,6 +7694,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
{
int i, err = 0;
+ /*
+ * Disallowing the root group RT runtime is BAD: it would prevent the
+ * kernel from creating (and/or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7736,9 +7761,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
rt_period = (u64)rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (rt_period == 0)
- return -EINVAL;
-
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
@@ -7795,7 +7817,7 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static int sched_dl_global_constraints(void)
+static int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
@@ -7896,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
if (ret)
goto undo;
- ret = sched_rt_global_constraints();
+ ret = sched_dl_global_validate();
if (ret)
goto undo;
- ret = sched_dl_global_constraints();
+ ret = sched_rt_global_constraints();
if (ret)
goto undo;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
- if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
+ if (later_mask &&
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
best_cpu = cpumask_any(later_mask);
goto out;
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
}
/*
+ * cpudl_set_freecpu - Set @cpu's bit in the cpudl.free_cpus mask
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ *
+ * Called from rq_online_dl() when a runqueue comes online.
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear @cpu's bit in the cpudl.free_cpus mask
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ *
+ * Called from rq_offline_dl() when a runqueue goes offline.
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context
*/
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
if (!cp->elements)
return -ENOMEM;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
kfree(cp->elements);
return -ENOMEM;
}
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
for_each_possible_cpu(i)
cp->elements[i].idx = IDX_INVALID;
- cpumask_setall(cp->free_cpus);
-
return 0;
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b52092f2636d..5e95145088fd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b)
dl_b->total_bw = 0;
}
-void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
+void init_dl_rq(struct dl_rq *dl_rq)
{
dl_rq->rb_root = RB_ROOT;
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq)
rq->post_schedule = has_pushable_dl_tasks(rq);
}
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
+
+/*
+ * Move deadline task @p off @rq and onto another runqueue. Called from
+ * dl_task_timer() when @rq is found to be !online.
+ *
+ * find_lock_later_rq() is tried first. If it returns NULL, an active cpu
+ * is picked instead and its runqueue is locked with double_lock_balance().
+ * @p is then deactivated on @rq, retargeted with set_task_cpu() and
+ * activated on the destination with ENQUEUE_REPLENISH. The destination is
+ * only rescheduled when it came from find_lock_later_rq(), i.e. not on the
+ * fallback path.
+ *
+ * NOTE(review): find_lock_later_rq() presumably returns with the later rq
+ * locked, matching the unconditional double_unlock_balance() at the end --
+ * confirm against its definition.
+ */
+static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+{
+ struct rq *later_rq = NULL;
+ bool fallback = false;
+
+ later_rq = find_lock_later_rq(p, rq);
+
+ if (!later_rq) {
+ int cpu;
+
+ /*
+ * If we cannot preempt any rq, fall back to picking any
+ * active cpu that p is allowed to run on.
+ */
+ fallback = true;
+ cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Failed to find any suitable cpu.
+ * The task will never come back!
+ */
+ BUG_ON(dl_bandwidth_enabled());
+
+ /*
+ * If admission control is disabled we
+ * try a little harder to let the task
+ * run: take any active cpu, ignoring
+ * the task's allowed mask.
+ */
+ cpu = cpumask_any(cpu_active_mask);
+ }
+ later_rq = cpu_rq(cpu);
+ double_lock_balance(rq, later_rq);
+ }
+
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, later_rq->cpu);
+ activate_task(later_rq, p, ENQUEUE_REPLENISH);
+
+ if (!fallback)
+ resched_curr(later_rq);
+
+ double_unlock_balance(rq, later_rq);
+}
+
#else
static inline
@@ -350,6 +396,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
+
+ if (dl_se->dl_yielded)
+ dl_se->dl_yielded = 0;
+ if (dl_se->dl_throttled)
+ dl_se->dl_throttled = 0;
}
/*
@@ -506,16 +557,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity,
dl_timer);
struct task_struct *p = dl_task_of(dl_se);
+ unsigned long flags;
struct rq *rq;
-again:
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (rq != task_rq(p)) {
- /* Task was moved, retrying. */
- raw_spin_unlock(&rq->lock);
- goto again;
- }
+ rq = task_rq_lock(p, &flags);
/*
* We need to take care of several possible races here:
@@ -536,25 +581,52 @@ again:
sched_clock_tick();
update_rq_clock(rq);
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
- if (task_on_rq_queued(p)) {
- enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
+
#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
- */
- if (has_pushable_dl_tasks(rq))
- push_dl_task(rq);
+ /*
+ * If we find that the rq the task was on is no longer
+ * available, we need to select a new rq.
+ */
+ if (unlikely(!rq->online)) {
+ dl_task_offline_migration(rq, p);
+ goto unlock;
+ }
#endif
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
}
+
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq,
+ * check if we need to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
+#endif
unlock:
- raw_spin_unlock(&rq->lock);
+ task_rq_unlock(rq, p, &flags);
return HRTIMER_NORESTART;
}
@@ -613,10 +685,9 @@ static void update_curr_dl(struct rq *rq)
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
+ dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
- dl_se->dl_throttled = 1;
- else
+ if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -853,7 +924,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
*/
- if (p->dl.dl_throttled)
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
return;
enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -898,7 +969,14 @@ static void yield_task_dl(struct rq *rq)
rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
+ update_rq_clock(rq);
update_curr_dl(rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq, true);
}
#ifdef CONFIG_SMP
@@ -1073,7 +1151,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+ /*
+ * Even when we have runtime, update_curr_dl() might have resulted in us
+ * not being the leftmost task anymore. In that case NEED_RESCHED will
+ * be set and schedule() will start a new hrtick for the next task.
+ */
+ if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(p, &rq->dl))
start_hrtick_dl(rq, p);
}
@@ -1094,6 +1178,7 @@ static void task_dead_dl(struct task_struct *p)
* Since we are TASK_DEAD we won't slip out of the domain!
*/
raw_spin_lock_irq(&dl_b->lock);
+ /* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
@@ -1165,9 +1250,6 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- cpumask_copy(later_mask, task_rq(task)->rd->span);
- cpumask_and(later_mask, later_mask, cpu_active_mask);
- cpumask_and(later_mask, later_mask, &task->cpus_allowed);
best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
task, later_mask);
if (best_cpu == -1)
@@ -1562,6 +1644,7 @@ static void rq_online_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_set_overload(rq);
+ cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
}
@@ -1573,6 +1656,7 @@ static void rq_offline_dl(struct rq *rq)
dl_clear_overload(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
void init_sched_dl_class(void)
@@ -1614,8 +1698,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
+ /* XXX we should retain the bw until 0-lag */
cancel_dl_timer(rq, p);
-
__dl_clear_params(p);
/*
@@ -1638,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
int check_resched = 1;
- /*
- * If p is throttled, don't consider the possibility
- * of preempting rq->curr, the check will be done right
- * after its runtime will get replenished.
- */
- if (unlikely(p->dl.dl_throttled))
- return;
-
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..a245c1fc6f0a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
if (!se) {
struct sched_avg *avg = &cpu_rq(cpu)->avg;
P(avg->runnable_avg_sum);
- P(avg->runnable_avg_period);
+ P(avg->avg_period);
return;
}
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->load.weight);
#ifdef CONFIG_SMP
P(se->avg.runnable_avg_sum);
- P(se->avg.runnable_avg_period);
+ P(se->avg.running_avg_sum);
+ P(se->avg.avg_period);
P(se->avg.load_avg_contrib);
+ P(se->avg.utilization_avg_contrib);
P(se->avg.decay_count);
#endif
#undef PN
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->runnable_load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
+ cfs_rq->utilization_load_avg);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
cfs_rq->tg_load_contrib);
@@ -305,6 +309,7 @@ do { \
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
+ PN(clock_task);
P(cpu_load[0]);
P(cpu_load[1]);
P(cpu_load[2]);
@@ -635,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
P(se.avg.runnable_avg_sum);
- P(se.avg.runnable_avg_period);
+ P(se.avg.running_avg_sum);
+ P(se.avg.avg_period);
P(se.avg.load_avg_contrib);
+ P(se.avg.utilization_avg_contrib);
P(se.avg.decay_count);
#endif
P(policy);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cbf371b..ffeaa4105e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -670,17 +670,18 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static inline void __update_task_entity_contrib(struct sched_entity *se);
+static inline void __update_task_entity_utilization(struct sched_entity *se);
/* Give new task start runnable values to heavy its load in infant time */
void init_task_runnable_average(struct task_struct *p)
{
u32 slice;
- p->se.avg.decay_count = 0;
slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = slice;
- p->se.avg.runnable_avg_period = slice;
+ p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
+ p->se.avg.avg_period = slice;
__update_task_entity_contrib(&p->se);
+ __update_task_entity_utilization(&p->se);
}
#else
void init_task_runnable_average(struct task_struct *p)
@@ -1197,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env,
static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
- long imb, old_imb;
- long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
+ long orig_src_load;
+ long load_a, load_b;
+ long moved_load;
+ long imb;
/*
* The load is corrected for the CPU capacity available on each node.
@@ -1212,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load,
dst_capacity = env->dst_stats.compute_capacity;
/* We care about the slope of the imbalance, not the direction. */
- if (dst_load < src_load)
- swap(dst_load, src_load);
+ load_a = dst_load;
+ load_b = src_load;
+ if (load_a < load_b)
+ swap(load_a, load_b);
/* Is the difference below the threshold? */
- imb = dst_load * src_capacity * 100 -
- src_load * dst_capacity * env->imbalance_pct;
+ imb = load_a * src_capacity * 100 -
+ load_b * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
/*
* The imbalance is above the allowed threshold.
- * Compare it with the old imbalance.
+ * Allow a move that brings us closer to a balanced situation,
+ * without moving things past the point of balance.
*/
orig_src_load = env->src_stats.load;
- orig_dst_load = env->dst_stats.load;
- if (orig_dst_load < orig_src_load)
- swap(orig_dst_load, orig_src_load);
-
- old_imb = orig_dst_load * src_capacity * 100 -
- orig_src_load * dst_capacity * env->imbalance_pct;
+ /*
+ * In a task swap, there will be one load moving from src to dst,
+ * and another moving back. This is the net sum of both moves.
+ * A simple task move will always have a positive value.
+ * Allow the move if it brings the system closer to a balanced
+ * situation, without crossing over the balance point.
+ */
+ moved_load = orig_src_load - src_load;
- /* Would this change make things worse? */
- return (imb > old_imb);
+ if (moved_load > 0)
+ /* Moving src -> dst. Did we overshoot balance? */
+ return src_load * dst_capacity < dst_load * src_capacity;
+ else
+ /* Moving dst -> src. Did we overshoot balance? */
+ return dst_load * src_capacity < src_load * dst_capacity;
}
/*
@@ -1610,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no record hinting faults then either the task is
* completely idle or all activity is areas that are not of interest
- * to automatic numa balancing. Scan slower
+ * to automatic numa balancing. Related to that, if there were failed
+ * migrations then it implies we are migrating too quickly or the local
+ * node is overloaded. In either case, scan slower.
*/
- if (local + shared == 0) {
+ if (local + shared == 0 || p->numa_faults_locality[2]) {
p->numa_scan_period = min(p->numa_scan_period_max,
p->numa_scan_period << 1);
@@ -1674,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
*period = now - p->last_task_numa_placement;
} else {
delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.runnable_avg_period;
+ *period = p->se.avg.avg_period;
}
p->last_sum_exec_runtime = runtime;
@@ -1730,7 +1744,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
nodes = node_online_map;
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
- nodemask_t max_group;
+ nodemask_t max_group = NODE_MASK_NONE;
int a, b;
/* Are there nodes at this distance from each other? */
@@ -1764,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
}
}
/* Next round, evaluate the nodes within max_group. */
+ if (!max_faults)
+ break;
nodes = max_group;
}
return nid;
@@ -2081,6 +2097,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
+ if (flags & TNF_MIGRATE_FAIL)
+ p->numa_faults_locality[2] += pages;
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
@@ -2162,8 +2180,10 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+ is_vm_hugetlb_page(vma)) {
continue;
+ }
/*
* Shared library pages mapped by multiple processes are not
@@ -2498,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now,
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
struct sched_avg *sa,
- int runnable)
+ int runnable,
+ int running)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
+ unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
delta = now - sa->last_runnable_update;
/*
@@ -2526,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
sa->last_runnable_update = now;
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->runnable_avg_period % 1024;
+ delta_w = sa->avg_period % 1024;
if (delta + delta_w >= 1024) {
/* period roll-over */
decayed = 1;
@@ -2539,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
delta_w = 1024 - delta_w;
if (runnable)
sa->runnable_avg_sum += delta_w;
- sa->runnable_avg_period += delta_w;
+ if (running)
+ sa->running_avg_sum += delta_w * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta_w;
delta -= delta_w;
@@ -2549,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
- sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+ sa->running_avg_sum = decay_load(sa->running_avg_sum,
+ periods + 1);
+ sa->avg_period = decay_load(sa->avg_period,
periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
if (runnable)
sa->runnable_avg_sum += runnable_contrib;
- sa->runnable_avg_period += runnable_contrib;
+ if (running)
+ sa->running_avg_sum += runnable_contrib * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
if (runnable)
sa->runnable_avg_sum += delta;
- sa->runnable_avg_period += delta;
+ if (running)
+ sa->running_avg_sum += delta * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta;
return decayed;
}
@@ -2574,11 +2607,13 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
u64 decays = atomic64_read(&cfs_rq->decay_counter);
decays -= se->avg.decay_count;
+ se->avg.decay_count = 0;
if (!decays)
return 0;
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.decay_count = 0;
+ se->avg.utilization_avg_contrib =
+ decay_load(se->avg.utilization_avg_contrib, decays);
return decays;
}
@@ -2614,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->runnable_avg_period + 1);
+ sa->avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
@@ -2667,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+ __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
+ runnable, runnable);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -2685,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.runnable_avg_period + 1);
+ contrib /= (se->avg.avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
}
@@ -2704,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
return se->avg.load_avg_contrib - old_contrib;
}
+
+/*
+ * Recompute se->avg.utilization_avg_contrib for a task entity as
+ * running_avg_sum * SCHED_LOAD_SCALE / (avg_period + 1). The "+ 1" keeps
+ * the divisor non-zero when avg_period is 0.
+ */
+static inline void __update_task_entity_utilization(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /*
+ * avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE: multiply by the
+ * scaled-down value and only scale the quotient back up at the end
+ */
+ contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
+ contrib /= (se->avg.avg_period + 1);
+ se->avg.utilization_avg_contrib = scale_load(contrib);
+}
+
+/*
+ * Refresh se->avg.utilization_avg_contrib and return the change
+ * (new value - old value).
+ *
+ * Task entities recompute the value via __update_task_entity_utilization();
+ * any other entity copies utilization_load_avg from group_cfs_rq(se).
+ */
+static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.utilization_avg_contrib;
+
+ if (entity_is_task(se))
+ __update_task_entity_utilization(se);
+ else
+ se->avg.utilization_avg_contrib =
+ group_cfs_rq(se)->utilization_load_avg;
+
+ return se->avg.utilization_avg_contrib - old_contrib;
+}
+
static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
long load_contrib)
{
@@ -2720,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta;
+ long contrib_delta, utilization_delta;
+ int cpu = cpu_of(rq_of(cfs_rq));
u64 now;
/*
@@ -2732,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
else
now = cfs_rq_clock_task(group_cfs_rq(se));
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
+ cfs_rq->curr == se))
return;
contrib_delta = __update_entity_load_avg_contrib(se);
+ utilization_delta = __update_entity_utilization_avg_contrib(se);
if (!update_cfs_rq)
return;
- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
- else
+ cfs_rq->utilization_load_avg += utilization_delta;
+ } else {
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ }
}
/*
@@ -2818,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
}
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -2836,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
update_cfs_rq_blocked_load(cfs_rq, !sleep);
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -3173,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -4299,6 +4367,11 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
+/*
+ * Return the cpu_capacity_orig value stored in @cpu's runqueue.
+ * update_cpu_capacity() stores it before multiplying by scale_rt_capacity().
+ */
+static unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -4712,6 +4785,33 @@ next:
done:
return target;
}
+/*
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the usage with the capacity of the CPU that is available for CFS
+ * tasks (i.e. cpu_capacity).
+ *
+ * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * CPU. It represents the amount of utilization of a CPU in the range
+ * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
+ * capacity of the CPU because it's about the running time on this CPU.
+ * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in avg_period and running_avg_sum, or just
+ * after migrating tasks until the average stabilizes with the new running
+ * time. So we need to check that the usage stays within the range
+ * [0..cpu_capacity_orig] and cap it if necessary.
+ *
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
+ * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity.
+ *
+ * Returns capacity_orig_of(@cpu) once utilization_load_avg has reached
+ * SCHED_LOAD_SCALE, otherwise utilization_load_avg multiplied by
+ * capacity_orig_of(@cpu) and shifted right by SCHED_LOAD_SHIFT.
+ */
+static int get_cpu_usage(int cpu)
+{
+ unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+ if (usage >= SCHED_LOAD_SCALE)
+ return capacity;
+
+ return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
@@ -5157,7 +5257,7 @@ static void yield_task_fair(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq->skip_clock_update = 1;
+ rq_clock_skip_update(rq, true);
}
set_skip_buddy(se);
@@ -5838,12 +5938,12 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
+ unsigned long group_usage; /* Total usage of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_has_free_capacity;
+ int group_no_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5914,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
-{
- return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_capacity(sd, cpu);
-}
-
static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
@@ -5940,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available, age_stamp, avg;
+ u64 total, used, age_stamp, avg;
s64 delta;
/*
@@ -5949,26 +6039,19 @@ static unsigned long scale_rt_capacity(int cpu)
*/
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
+ delta = __rq_clock_broken(rq) - age_stamp;
- delta = rq_clock(rq) - age_stamp;
if (unlikely(delta < 0))
delta = 0;
total = sched_avg_period() + delta;
- if (unlikely(total < avg)) {
- /* Ensures that capacity won't end up being negative */
- available = 0;
- } else {
- available = total - avg;
- }
+ used = div_u64(avg, total);
- if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
- total = SCHED_CAPACITY_SCALE;
+ if (likely(used < SCHED_CAPACITY_SCALE))
+ return SCHED_CAPACITY_SCALE - used;
- total >>= SCHED_CAPACITY_SHIFT;
-
- return div_u64(available, total);
+ return 1;
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -5983,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
capacity >>= SCHED_CAPACITY_SHIFT;
- sdg->sgc->capacity_orig = capacity;
-
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_freq_capacity(sd, cpu);
- else
- capacity *= default_scale_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
+ cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -6006,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, capacity_orig;
+ unsigned long capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6018,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
return;
}
- capacity_orig = capacity = 0;
+ capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -6038,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
- * This avoids capacity/capacity_orig from being 0 and
+ * This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
- *
- * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- capacity_orig += capacity_of(cpu);
capacity += capacity_of(cpu);
continue;
}
sgc = rq->sd->groups->sgc;
- capacity_orig += sgc->capacity_orig;
capacity += sgc->capacity;
}
} else {
@@ -6061,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- capacity_orig += group->sgc->capacity_orig;
capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgc->capacity_orig = capacity_orig;
sdg->sgc->capacity = capacity;
}
/*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true if the capacity is reduced
*/
static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
- /*
- * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
- */
- if (!(sd->flags & SD_SHARE_CPUCAPACITY))
- return 0;
-
- /*
- * If ~90% of the cpu_capacity is still there, we're good.
- */
- if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
- return 1;
-
- return 0;
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
+ (rq->cpu_capacity_orig * 100));
}
/*
@@ -6131,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group)
}
/*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the number of tasks is
+ * smaller than the number of CPUs or if the usage is lower than the available
+ * capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity is meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't bring
+ * any benefit to the load balancer.
*/
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
{
- unsigned int capacity_factor, smt, cpus;
- unsigned int capacity, capacity_orig;
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
- capacity = group->sgc->capacity;
- capacity_orig = group->sgc->capacity_orig;
- cpus = group->group_weight;
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
- capacity_factor = cpus / smt; /* cores */
+ return false;
+}
- capacity_factor = min_t(unsigned,
- capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle.
+ * group_is_overloaded is not equal to !group_has_capacity because a group
+ * with the exact right number of tasks has no more spare capacity but is not
+ * overloaded so both group_has_capacity and group_is_overloaded return
+ * false.
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
- return capacity_factor;
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
+
+ return false;
}
-static enum group_type
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+static enum group_type group_classify(struct lb_env *env,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_no_capacity)
return group_overloaded;
if (sg_imbalanced(group))
@@ -6199,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
+ sgs->group_usage += get_cpu_usage(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
if (rq->nr_running > 1)
@@ -6221,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
- sgs->group_capacity_factor = sg_capacity_factor(env, group);
- sgs->group_type = group_classify(group, sgs);
- if (sgs->group_capacity_factor > sgs->sum_nr_running)
- sgs->group_has_free_capacity = 1;
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(env, group, sgs);
}
/**
@@ -6347,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity factor to one so that we'll try
+ * first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity_factor. The
- * extra check prevents the case where you always pull from the
- * heaviest group when it is already under-utilized (possible
- * with a large weight task outweighs the tasks on the system).
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible with a large weight task outweighs
+ * the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity) {
- sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
- sgs->group_type = group_classify(sg, sgs);
+ group_has_capacity(env, &sds->local_stat) &&
+ (sgs->sum_nr_running > 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_overloaded;
}
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -6538,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
- load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity_factor);
-
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
- load_above_capacity /= busiest->group_capacity;
+ load_above_capacity = busiest->sum_nr_running *
+ SCHED_LOAD_SCALE;
+ if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity -= busiest->group_capacity;
+ else
+ load_above_capacity = ~0UL;
}
/*
@@ -6605,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
+ /* ASYM feature bypasses nice load balance check */
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
@@ -6625,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
- !busiest->group_has_free_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
goto force_balance;
/*
@@ -6685,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long capacity, capacity_factor, wl;
+ unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6714,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
capacity = capacity_of(i);
- capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
@@ -6724,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
*/
- if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
continue;
/*
@@ -6772,6 +6849,19 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ /*
+ * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+ * It's worth migrating the task if the src_cpu's capacity is reduced
+ * because of other sched_class or IRQs if more capacity stays
+ * available on dst_cpu.
+ */
+ if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->cfs.h_nr_running == 1)) {
+ if ((check_cpu_capacity(env->src_rq, sd)) &&
+ (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -6871,6 +6961,9 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
@@ -6880,8 +6973,6 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -7581,22 +7672,25 @@ end:
/*
* Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu is the system.
+ * of an idle cpu in the system.
* - This rq has more than one task.
- * - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's capacity.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At the parent of the LLC scheduler domain level, this cpu's scheduler group has
+ * multiple busy cpus.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline int nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
+ bool kick = false;
if (unlikely(rq->idle_balance))
- return 0;
+ return false;
/*
* We may be recently in ticked or tickless idle mode. At the first
@@ -7610,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq)
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
- return 0;
+ return false;
if (time_before(now, nohz.next_balance))
- return 0;
+ return false;
if (rq->nr_running >= 2)
- goto need_kick;
+ return true;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
-
if (sd) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
- if (nr_busy > 1)
- goto need_kick_unlock;
+ if (nr_busy > 1) {
+ kick = true;
+ goto unlock;
+ }
+
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu))
- goto need_kick_unlock;
-
- rcu_read_unlock();
- return 0;
+ sched_domain_span(sd)) < cpu)) {
+ kick = true;
+ goto unlock;
+ }
-need_kick_unlock:
+unlock:
rcu_read_unlock();
-need_kick:
- return 1;
+ return kick;
}
#else
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
@@ -7657,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
- rebalance_domains(this_rq, idle);
-
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
- * stopped.
+ * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * give the idle cpus a chance to load balance. Else we may
+ * load balance only within the local sched_domain hierarchy
+ * and abort nohz_idle_balance altogether if we pull some load.
*/
nohz_idle_balance(this_rq, idle);
+ rebalance_domains(this_rq, idle);
}
/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 90284d117fe6..91e33cd485f6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * In order to avoid a thundering herd attack of CPUs that are
+ * lowering their priorities at the same time, and there being
+ * a single CPU that has an RT task that can migrate and is waiting
+ * to run, where the other CPUs will try to take that CPU's
+ * rq lock and possibly create a large contention, sending an
+ * IPI to that CPU and let that CPU push the RT task to where
+ * it should go may be a better scenario.
+ */
+SCHED_FEAT(RT_PUSH_IPI, true)
+#endif
+
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..4d207d2abcbd 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
+#include <linux/suspend.h>
#include <asm/tlb.h>
@@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
- while (!tif_need_resched())
+ while (!tif_need_resched() &&
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
@@ -80,6 +82,7 @@ static void cpuidle_idle_call(void)
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
unsigned int broadcast;
+ bool reflect;
/*
* Check if the idle task must be rescheduled. If it is the
@@ -103,25 +106,37 @@ static void cpuidle_idle_call(void)
*/
rcu_idle_enter();
+ if (cpuidle_not_available(drv, dev))
+ goto use_default;
+
/*
- * Ask the cpuidle framework to choose a convenient idle state.
- * Fall back to the default arch idle method on errors.
+ * Suspend-to-idle ("freeze") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in interrupts (if any). In that case bypass
+ * the cpuidle governor and go straight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
*/
- next_state = cpuidle_select(drv, dev);
- if (next_state < 0) {
-use_default:
- /*
- * We can't use the cpuidle framework, let's use the default
- * idle routine.
- */
- if (current_clr_polling_and_test())
+ if (idle_should_freeze()) {
+ entered_state = cpuidle_enter_freeze(drv, dev);
+ if (entered_state >= 0) {
local_irq_enable();
- else
- arch_cpu_idle();
+ goto exit_idle;
+ }
- goto exit_idle;
+ reflect = false;
+ next_state = cpuidle_find_deepest_state(drv, dev);
+ } else {
+ reflect = true;
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ */
+ next_state = cpuidle_select(drv, dev);
}
-
+ /* Fall back to the default arch idle method on errors. */
+ if (next_state < 0)
+ goto use_default;
/*
* The idle task must be scheduled, it is pointless to
@@ -143,8 +158,7 @@ use_default:
* is used from another cpu as a broadcast timer, this call may
* fail if it is not available
*/
- if (broadcast &&
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+ if (broadcast && tick_broadcast_enter())
goto use_default;
/* Take note of the planned idle state. */
@@ -161,12 +175,13 @@ use_default:
idle_set_state(this_rq(), NULL);
if (broadcast)
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+ tick_broadcast_exit();
/*
* Give the governor an opportunity to reflect on the outcome
*/
- cpuidle_reflect(dev, entered_state);
+ if (reflect)
+ cpuidle_reflect(dev, entered_state);
exit_idle:
__current_set_polling();
@@ -179,6 +194,19 @@ exit_idle:
rcu_idle_exit();
start_critical_timings();
+ return;
+
+use_default:
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
+ */
+ if (current_clr_polling_and_test())
+ local_irq_enable();
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
}
/*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..575da76a3874 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -6,6 +6,7 @@
#include "sched.h"
#include <linux/slab.h>
+#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+#ifdef CONFIG_SMP
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
+void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
int i;
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
+
+#ifdef HAVE_RT_PUSH_IPI
+ rt_rq->push_flags = 0;
+ rt_rq->push_cpu = nr_cpu_ids;
+ raw_spin_lock_init(&rt_rq->push_lock);
+ init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
+#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
- init_rt_rq(rt_rq, cpu_rq(i));
+ init_rt_rq(rt_rq);
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
@@ -831,11 +843,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
enqueue = 1;
/*
- * Force a clock update if the CPU was idle,
- * lest wakeup -> unthrottle time accumulate.
+ * When we're idle and a woken (rt) task is
+ * throttled check_preempt_curr() will set
+ * skip_update and the time between the wakeup
+ * and this unthrottle will get accounted as
+ * 'runtime'.
*/
if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- rq->skip_clock_update = -1;
+ rq_clock_skip_update(rq, false);
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
@@ -1337,7 +1352,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
- if (target != -1)
+ /*
+ * Don't bother moving it if the destination CPU is
+ * not running a lower priority task.
+ */
+ if (target != -1 &&
+ p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
rcu_read_unlock();
@@ -1614,6 +1634,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
lowest_rq = cpu_rq(cpu);
+ if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+ /*
+ * Target rq has tasks of equal or higher priority,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ lowest_rq = NULL;
+ break;
+ }
+
/* if the prio of this runqueue changed, try again */
if (double_lock_balance(rq, lowest_rq)) {
/*
@@ -1760,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq)
;
}
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
+ *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or if this is the first instance, it must hold rq->cpu.
+ */
+static int rto_next_cpu(struct rq *rq)
+{
+ int prev_cpu = rq->rt.push_cpu;
+ int cpu;
+
+ cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
+ /*
+ * If the previous cpu is less than the rq's CPU, then it already
+ * passed the end of the mask, and has started from the beginning.
+ * We end if the next CPU is greater or equal to rq's CPU.
+ */
+ if (prev_cpu < rq->cpu) {
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+
+ } else if (cpu >= nr_cpu_ids) {
+ /*
+ * We passed the end of the mask, start at the beginning.
+ * If the result is greater or equal to the rq's CPU, then
+ * the loop is finished.
+ */
+ cpu = cpumask_first(rq->rd->rto_mask);
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+ }
+ rq->rt.push_cpu = cpu;
+
+ /* Return cpu to let the caller know if the loop is finished or not */
+ return cpu;
+}
+
+static int find_next_push_cpu(struct rq *rq)
+{
+ struct rq *next_rq;
+ int cpu;
+
+ while (1) {
+ cpu = rto_next_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ break;
+ next_rq = cpu_rq(cpu);
+
+ /* Make sure the next rq can push to this rq */
+ if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ break;
+ }
+
+ return cpu;
+}
+
+#define RT_PUSH_IPI_EXECUTING 1
+#define RT_PUSH_IPI_RESTART 2
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu;
+
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ raw_spin_lock(&rq->rt.push_lock);
+ /* Make sure it's still executing */
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ /*
+ * Tell the IPI to restart the loop as things have
+ * changed since it started.
+ */
+ rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+ raw_spin_unlock(&rq->rt.push_lock);
+ return;
+ }
+ raw_spin_unlock(&rq->rt.push_lock);
+ }
+
+ /* When here, there's no IPI going around */
+
+ rq->rt.push_cpu = rq->cpu;
+ cpu = find_next_push_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+ irq_work_queue_on(&rq->rt.push_work, cpu);
+}
+
+/* Called from hardirq context */
+static void try_to_push_tasks(void *arg)
+{
+ struct rt_rq *rt_rq = arg;
+ struct rq *rq, *src_rq;
+ int this_cpu;
+ int cpu;
+
+ this_cpu = rt_rq->push_cpu;
+
+ /* Paranoid check */
+ BUG_ON(this_cpu != smp_processor_id());
+
+ rq = cpu_rq(this_cpu);
+ src_rq = rq_of_rt_rq(rt_rq);
+
+again:
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+ push_rt_task(rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ /* Pass the IPI to the next rt overloaded queue */
+ raw_spin_lock(&rt_rq->push_lock);
+ /*
+ * If the source queue changed since the IPI went out,
+ * we need to restart the search from that CPU again.
+ */
+ if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+ rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+ rt_rq->push_cpu = src_rq->cpu;
+ }
+
+ cpu = find_next_push_cpu(src_rq);
+
+ if (cpu >= nr_cpu_ids)
+ rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+ raw_spin_unlock(&rt_rq->push_lock);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * It is possible that a restart caused this CPU to be
+ * chosen again. Don't bother with an IPI, just see if we
+ * have more to push.
+ */
+ if (unlikely(cpu == rq->cpu))
+ goto again;
+
+ /* Try the next RT overloaded CPU */
+ irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+ struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+ try_to_push_tasks(rt_rq);
+}
+#endif /* HAVE_RT_PUSH_IPI */
+
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
@@ -1775,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+#ifdef HAVE_RT_PUSH_IPI
+ if (sched_feat(RT_PUSH_IPI)) {
+ tell_cpu_to_push(this_rq);
+ return 0;
+ }
+#endif
+
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..e0e129993958 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
@@ -362,8 +363,14 @@ struct cfs_rq {
* Under CFS, load is tracked on a per-entity basis and aggregated up.
* This allows for the description of both thread and group usage (in
* the FAIR_GROUP_SCHED case).
+ * runnable_load_avg is the sum of the load_avg_contrib of the
+ * sched_entities on the rq.
+ * blocked_load_avg is similar to runnable_load_avg except that it covers
+ * the blocked sched_entities on the rq.
+ * utilization_load_avg is the sum of the average running time of the
+ * sched_entities on the rq.
*/
- unsigned long runnable_load_avg, blocked_load_avg;
+ unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
atomic64_t decay_counter;
u64 last_decay;
atomic_long_t removed_load;
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
+/* RT IPI pull logic requires IRQ_WORK */
+#ifdef CONFIG_IRQ_WORK
+# define HAVE_RT_PUSH_IPI
+#endif
+
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
@@ -435,7 +447,13 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
+#ifdef HAVE_RT_PUSH_IPI
+ int push_flags;
+ int push_cpu;
+ struct irq_work push_work;
+ raw_spinlock_t push_lock;
#endif
+#endif /* CONFIG_SMP */
int rt_queued;
int rt_throttled;
@@ -558,8 +576,6 @@ struct rq {
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
- int skip_clock_update;
-
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -588,6 +604,7 @@ struct rq {
unsigned long next_balance;
struct mm_struct *prev_mm;
+ unsigned int clock_skip_update;
u64 clock;
u64 clock_task;
@@ -598,6 +615,7 @@ struct rq {
struct sched_domain *sd;
unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
unsigned char idle_balance;
/* For active balancing */
@@ -687,16 +705,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return ACCESS_ONCE(rq->clock);
+}
+
static inline u64 rq_clock(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock_task;
}
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+
+static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+{
+ lockdep_assert_held(&rq->lock);
+ if (skip)
+ rq->clock_skip_update |= RQCF_REQ_SKIP;
+ else
+ rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+}
+
#ifdef CONFIG_NUMA
enum numa_topology_type {
NUMA_DIRECT,
@@ -789,7 +826,7 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity, capacity_orig;
+ unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -1350,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq)
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);
+
+#ifndef arch_scale_freq_capacity
+static __always_inline
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
- rq->rt_avg += rt_delta;
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
sched_avg_update(rq);
}
#else
@@ -1362,6 +1408,82 @@ static inline void sched_avg_update(struct rq *rq) { }
extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -1549,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
-extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
+extern void init_rt_rq(struct rt_rq *rt_rq);
+extern void init_dl_rq(struct dl_rq *dl_rq);
extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index a476bea17fbc..87e2c9f0c33e 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -15,11 +15,6 @@
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
- int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
- char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
- if (mask_str == NULL)
- return -ENOMEM;
if (v == (void *)1) {
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
@@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- cpumask_scnprintf(mask_str, mask_len,
- sched_domain_span(sd));
- seq_printf(seq, "domain%d %s", dcount++, mask_str);
+ seq_printf(seq, "domain%d %*pb", dcount++,
+ cpumask_pr_args(sched_domain_span(sd)));
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
itype++) {
seq_printf(seq, " %u %u %u %u %u %u %u %u",
@@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
rcu_read_unlock();
#endif
}
- kfree(mask_str);
return 0;
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4ef9687ac115..4f44028943e6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
switch (action) {
case SECCOMP_RET_ERRNO:
- /* Set the low-order 16-bits as a errno. */
+ /* Set low-order bits as an errno, capped at MAX_ERRNO. */
+ if (data > MAX_ERRNO)
+ data = MAX_ERRNO;
syscall_set_return_value(current, task_pt_regs(current),
-data, 0);
goto skip;
diff --git a/kernel/signal.c b/kernel/signal.c
index 16a305295256..a390499943e4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals);
*/
SYSCALL_DEFINE0(restart_syscall)
{
- struct restart_block *restart = &current_thread_info()->restart_block;
+ struct restart_block *restart = &current->restart_block;
return restart->fn(restart);
}
@@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
SYSCALL_DEFINE0(pause)
{
while (!signal_pending(current)) {
- current->state = TASK_INTERRUPTIBLE;
+ __set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
return -ERESTARTNOHAND;
@@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
- current->state = TASK_INTERRUPTIBLE;
+ __set_current_state(TASK_INTERRUPTIBLE);
schedule();
set_restore_sigmask();
return -ERESTARTNOHAND;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f032fb5284e3..40190f28db35 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
unsigned int cpu;
int ret = 0;
+ get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
@@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 501baa9ac1be..479e4436f787 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
- if (preempt_count() == cnt)
+ if (preempt_count() == cnt) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+#endif
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ }
}
EXPORT_SYMBOL(__local_bh_disable_ip);
#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu)
* in the task stack here.
*/
__do_softirq();
- rcu_note_context_switch();
local_irq_enable();
- cond_resched();
+ cond_resched_rcu_qs();
return;
}
local_irq_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index a8c9f5a7dda6..a03d9cd23ed7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -97,6 +97,12 @@
#ifndef MPX_DISABLE_MANAGEMENT
# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
#endif
+#ifndef GET_FP_MODE
+# define GET_FP_MODE(a) (-EINVAL)
+#endif
+#ifndef SET_FP_MODE
+# define SET_FP_MODE(a,b) (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -1102,6 +1108,7 @@ DECLARE_RWSEM(uts_sem);
/*
* Work around broken programs that cannot handle "Linux 3.0".
* Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
+ * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60.
*/
static int override_release(char __user *release, size_t len)
{
@@ -1121,7 +1128,7 @@ static int override_release(char __user *release, size_t len)
break;
rest++;
}
- v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
+ v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
copy = clamp_t(size_t, len, 1, sizeof(buf));
copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
ret = copy_to_user(release, buf, copy + 1);
@@ -2210,11 +2217,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
up_write(&me->mm->mmap_sem);
break;
case PR_MPX_ENABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
error = MPX_ENABLE_MANAGEMENT(me);
break;
case PR_MPX_DISABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
error = MPX_DISABLE_MANAGEMENT(me);
break;
+ case PR_SET_FP_MODE:
+ error = SET_FP_MODE(me, arg2);
+ break;
+ case PR_GET_FP_MODE:
+ error = GET_FP_MODE(me);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 137c7f69b264..ce410bb9f2e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1228,6 +1228,14 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "dirtytime_expire_seconds",
+ .data = &dirtytime_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = dirtytime_interval_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "nr_pdflush_threads",
.mode = 0444 /* read-only */,
.proc_handler = pdflush_proc_obsolete,
@@ -1248,7 +1256,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_sysctl_handler,
- .extra1 = &zero,
},
#ifdef CONFIG_NUMA
{
@@ -1257,7 +1264,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
- .extra1 = &zero,
},
#endif
{
@@ -1280,7 +1286,6 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_overcommit_handler,
- .extra1 = &zero,
},
#endif
{
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 670fff88a961..21f82c29c914 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
void *reply = genlmsg_data(genlhdr);
- int rc;
- rc = genlmsg_end(skb, reply);
- if (rc < 0) {
- nlmsg_free(skb);
- return rc;
- }
+ genlmsg_end(skb, reply);
return genlmsg_reply(skb, info);
}
@@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
void *reply = genlmsg_data(genlhdr);
int rc, delcount = 0;
- rc = genlmsg_end(skb, reply);
- if (rc < 0) {
- nlmsg_free(skb);
- return;
- }
+ genlmsg_end(skb, reply);
rc = 0;
down_read(&listeners->sem);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d626dc98e8df..579ce1b929af 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET
config GENERIC_CLOCKEVENTS
bool
-# Migration helper. Builds, but does not invoke
-config GENERIC_CLOCKEVENTS_BUILD
- bool
- default y
- depends on GENERIC_CLOCKEVENTS
-
# Architecture can handle broadcast in a driver-agnostic way
config ARCH_HAS_TICK_BROADCAST
bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f622cf28628a..01f0312419b3 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,16 +1,14 @@
obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
-obj-y += timeconv.o posix-clock.o alarmtimer.o
+obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o
ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
obj-y += tick-broadcast.o
obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
-obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
-obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index a7077d3ae52f..1b001ed1edb9 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
goto out;
}
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp.tv64;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 55449909f114..25d942d1da27 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
+static int __clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
+{
+ /* Transition with legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* Legacy callback doesn't support new modes */
+ if (state > CLOCK_EVT_STATE_ONESHOT)
+ return -ENOSYS;
+ /*
+ * 'clock_event_state' and 'clock_event_mode' have 1-to-1
+ * mapping until *_ONESHOT, and so a simple cast will work.
+ */
+ dev->set_mode((enum clock_event_mode)state, dev);
+ dev->mode = (enum clock_event_mode)state;
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* Transition with new state-specific callbacks */
+ switch (state) {
+ case CLOCK_EVT_STATE_DETACHED:
+ /*
+ * This is an internal state, which is guaranteed to go from
+ * SHUTDOWN to DETACHED. No driver interaction required.
+ */
+ return 0;
+
+ case CLOCK_EVT_STATE_SHUTDOWN:
+ return dev->set_state_shutdown(dev);
+
+ case CLOCK_EVT_STATE_PERIODIC:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
+ return -ENOSYS;
+ return dev->set_state_periodic(dev);
+
+ case CLOCK_EVT_STATE_ONESHOT:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return -ENOSYS;
+ return dev->set_state_oneshot(dev);
+
+ default:
+ return -ENOSYS;
+ }
+}
+
/**
- * clockevents_set_mode - set the operating mode of a clock event device
+ * clockevents_set_state - set the operating state of a clock event device
* @dev: device to modify
- * @mode: new mode
+ * @state: new state
*
* Must be called with interrupts disabled !
*/
-void clockevents_set_mode(struct clock_event_device *dev,
- enum clock_event_mode mode)
+void clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
- if (dev->mode != mode) {
- dev->set_mode(mode, dev);
- dev->mode = mode;
+ if (dev->state != state) {
+ if (__clockevents_set_state(dev, state))
+ return;
+
+ dev->state = state;
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
- if (mode == CLOCK_EVT_MODE_ONESHOT) {
+ if (state == CLOCK_EVT_STATE_ONESHOT) {
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
@@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev,
*/
void clockevents_shutdown(struct clock_event_device *dev)
{
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event.tv64 = KTIME_MAX;
}
+/**
+ * clockevents_tick_resume - Resume the tick device before using it again
+ * @dev: device to resume
+ */
+int clockevents_tick_resume(struct clock_event_device *dev)
+{
+ int ret = 0;
+
+ if (dev->set_mode) {
+ dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
+ dev->mode = CLOCK_EVT_MODE_RESUME;
+ } else if (dev->tick_resume) {
+ ret = dev->tick_resume(dev);
+ }
+
+ return ret;
+}
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
/* Limit min_delta to a jiffie */
@@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
dev->retries++;
@@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
dev->retries++;
@@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
dev->next_event = expires;
- if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
return 0;
/* Shortcut for clockevent devices that can deal with ktime. */
@@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced)
struct clock_event_device *dev, *newdev = NULL;
list_for_each_entry(dev, &clockevent_devices, list) {
- if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
+ if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
continue;
if (!tick_check_replacement(newdev, dev))
@@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced)
static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
{
/* Fast track. Device is unused */
- if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
+ if (ced->state == CLOCK_EVT_STATE_DETACHED) {
list_del_init(&ced->list);
return 0;
}
@@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
}
EXPORT_SYMBOL_GPL(clockevents_unbind);
+/* Sanity check of state transition callbacks */
+static int clockevents_sanity_check(struct clock_event_device *dev)
+{
+ /* Legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* We shouldn't be supporting new modes now */
+ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
+ dev->set_state_shutdown || dev->tick_resume);
+
+ BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* New state-specific callbacks */
+ if (!dev->set_state_shutdown)
+ return -EINVAL;
+
+ if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+ !dev->set_state_periodic)
+ return -EINVAL;
+
+ if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+ !dev->set_state_oneshot)
+ return -EINVAL;
+
+ return 0;
+}
+
/**
* clockevents_register_device - register a clock event device
* @dev: device to register
@@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev)
{
unsigned long flags;
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ BUG_ON(clockevents_sanity_check(dev));
+
+ /* Initialize state to DETACHED */
+ dev->state = CLOCK_EVT_STATE_DETACHED;
+
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
dev->cpumask = cpumask_of(smp_processor_id());
@@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
clockevents_config(dev, freq);
- if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
+ if (dev->state == CLOCK_EVT_STATE_ONESHOT)
return clockevents_program_event(dev, dev->next_event, false);
- if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
- dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
+ if (dev->state == CLOCK_EVT_STATE_PERIODIC)
+ return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
return 0;
}
@@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev)
* @old: device to release (can be NULL)
* @new: device to request (can be NULL)
*
- * Called from the notifier chain. clockevents_lock is held already
+ * Called from various tick functions with clockevents_lock held and
+ * interrupts disabled.
*/
void clockevents_exchange_device(struct clock_event_device *old,
struct clock_event_device *new)
{
- unsigned long flags;
-
- local_irq_save(flags);
/*
* Caller releases a clock event device. We queue it into the
* released list and do a notify add later.
*/
if (old) {
module_put(old->owner);
- clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+ clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
}
if (new) {
- BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+ BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
clockevents_shutdown(new);
}
- local_irq_restore(flags);
}
/**
@@ -541,74 +642,40 @@ void clockevents_resume(void)
dev->resume(dev);
}
-#ifdef CONFIG_GENERIC_CLOCKEVENTS
+#ifdef CONFIG_HOTPLUG_CPU
/**
- * clockevents_notify - notification about relevant events
- * Returns 0 on success, any other value on error
+ * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
*/
-int clockevents_notify(unsigned long reason, void *arg)
+void tick_cleanup_dead_cpu(int cpu)
{
struct clock_event_device *dev, *tmp;
unsigned long flags;
- int cpu, ret = 0;
raw_spin_lock_irqsave(&clockevents_lock, flags);
- switch (reason) {
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
- tick_broadcast_on_off(reason, arg);
- break;
-
- case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
- case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
- ret = tick_broadcast_oneshot_control(reason);
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DYING:
- tick_handover_do_timer(arg);
- break;
-
- case CLOCK_EVT_NOTIFY_SUSPEND:
- tick_suspend();
- tick_suspend_broadcast();
- break;
-
- case CLOCK_EVT_NOTIFY_RESUME:
- tick_resume();
- break;
-
- case CLOCK_EVT_NOTIFY_CPU_DEAD:
- tick_shutdown_broadcast_oneshot(arg);
- tick_shutdown_broadcast(arg);
- tick_shutdown(arg);
- /*
- * Unregister the clock event devices which were
- * released from the users in the notify chain.
- */
- list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ tick_shutdown_broadcast_oneshot(cpu);
+ tick_shutdown_broadcast(cpu);
+ tick_shutdown(cpu);
+ /*
+ * Unregister the clock event devices which were
+ * released from the users in the notify chain.
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ list_del(&dev->list);
+ /*
+ * Now check whether the CPU has left unused per cpu devices
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
+ if (cpumask_test_cpu(cpu, dev->cpumask) &&
+ cpumask_weight(dev->cpumask) == 1 &&
+ !tick_is_broadcast_device(dev)) {
+ BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
list_del(&dev->list);
- /*
- * Now check whether the CPU has left unused per cpu devices
- */
- cpu = *((int *)arg);
- list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
- if (cpumask_test_cpu(cpu, dev->cpumask) &&
- cpumask_weight(dev->cpumask) == 1 &&
- !tick_is_broadcast_device(dev)) {
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
- list_del(&dev->list);
- }
}
- break;
- default:
- break;
}
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
- return ret;
}
-EXPORT_SYMBOL_GPL(clockevents_notify);
+#endif
#ifdef CONFIG_SYSFS
struct bus_type clockevents_subsys = {
@@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void)
}
device_initcall(clockevents_init_sysfs);
#endif /* SYSFS */
-
-#endif /* GENERIC_CLOCK_EVENTS */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index b79f39bda7e1..15facb1b9c60 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -34,82 +34,6 @@
#include "tick-internal.h"
#include "timekeeping_internal.h"
-void timecounter_init(struct timecounter *tc,
- const struct cyclecounter *cc,
- u64 start_tstamp)
-{
- tc->cc = cc;
- tc->cycle_last = cc->read(cc);
- tc->nsec = start_tstamp;
-}
-EXPORT_SYMBOL_GPL(timecounter_init);
-
-/**
- * timecounter_read_delta - get nanoseconds since last call of this function
- * @tc: Pointer to time counter
- *
- * When the underlying cycle counter runs over, this will be handled
- * correctly as long as it does not run over more than once between
- * calls.
- *
- * The first call to this function for a new time counter initializes
- * the time tracking and returns an undefined result.
- */
-static u64 timecounter_read_delta(struct timecounter *tc)
-{
- cycle_t cycle_now, cycle_delta;
- u64 ns_offset;
-
- /* read cycle counter: */
- cycle_now = tc->cc->read(tc->cc);
-
- /* calculate the delta since the last timecounter_read_delta(): */
- cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
-
- /* convert to nanoseconds: */
- ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
-
- /* update time stamp of timecounter_read_delta() call: */
- tc->cycle_last = cycle_now;
-
- return ns_offset;
-}
-
-u64 timecounter_read(struct timecounter *tc)
-{
- u64 nsec;
-
- /* increment time by nanoseconds since last call */
- nsec = timecounter_read_delta(tc);
- nsec += tc->nsec;
- tc->nsec = nsec;
-
- return nsec;
-}
-EXPORT_SYMBOL_GPL(timecounter_read);
-
-u64 timecounter_cyc2time(struct timecounter *tc,
- cycle_t cycle_tstamp)
-{
- u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
- u64 nsec;
-
- /*
- * Instead of always treating cycle_tstamp as more recent
- * than tc->cycle_last, detect when it is too far in the
- * future and treat it as old time stamp instead.
- */
- if (cycle_delta > tc->cc->mask / 2) {
- cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
- nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
- } else {
- nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
- }
-
- return nsec;
-}
-EXPORT_SYMBOL_GPL(timecounter_cyc2time);
-
/**
* clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
* @mult: pointer to mult variable
@@ -218,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs)
schedule_work(&watchdog_work);
}
-static void clocksource_unstable(struct clocksource *cs, int64_t delta)
-{
- printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
- cs->name, delta);
- __clocksource_unstable(cs);
-}
-
/**
* clocksource_mark_unstable - mark clocksource unstable via watchdog
* @cs: clocksource to be marked unstable
@@ -250,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
- cycle_t csnow, wdnow, delta;
+ cycle_t csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
@@ -289,6 +206,8 @@ static void clocksource_watchdog(unsigned long data)
delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wdlast = cs->wd_last; /* save these in case we print them */
+ cslast = cs->cs_last;
cs->cs_last = csnow;
cs->wd_last = wdnow;
@@ -297,7 +216,12 @@ static void clocksource_watchdog(unsigned long data)
/* Check the deviation from the watchdog clocksource. */
if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
- clocksource_unstable(cs, cs_nsec - wd_nsec);
+ pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
+ pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, csnow, cslast, cs->mask);
+ __clocksource_unstable(cs);
continue;
}
@@ -545,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
* @shift: cycle to nanosecond divisor (power of two)
* @maxadj: maximum adjustment value to mult (~11%)
* @mask: bitmask for two's complement subtraction of non 64 bit counters
+ * @max_cyc: maximum cycle value before potential overflow (does not include
+ * any safety margin)
+ *
+ * NOTE: This function includes a safety margin of 50%, in other words, we
+ * return half the number of nanoseconds the hardware counter can technically
+ * cover. This is done so that we can potentially detect problems caused by
+ * delayed timers or bad hardware, which might result in time intervals that
+ * are larger than what the math used can handle without overflows.
*/
-u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
u64 max_nsecs, max_cycles;
/*
* Calculate the maximum number of cycles that we can pass to the
- * cyc2ns function without overflowing a 64-bit signed result. The
- * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
- * which is equivalent to the below.
- * max_cycles < (2^63)/(mult + maxadj)
- * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
- * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
- * max_cycles < 2^(63 - log2(mult + maxadj))
- * max_cycles < 1 << (63 - log2(mult + maxadj))
- * Please note that we add 1 to the result of the log2 to account for
- * any rounding errors, ensure the above inequality is satisfied and
- * no overflow will occur.
+ * cyc2ns() function without overflowing a 64-bit result.
*/
- max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
+ max_cycles = ULLONG_MAX;
+ do_div(max_cycles, mult+maxadj);
/*
* The actual maximum number of cycles we can defer the clocksource is
@@ -575,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
max_cycles = min(max_cycles, mask);
max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+ /* return the max_cycles value as well if requested */
+ if (max_cyc)
+ *max_cyc = max_cycles;
+
+ /* Return 50% of the actual maximum, so we can detect bad values */
+ max_nsecs >>= 1;
+
return max_nsecs;
}
/**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs: Pointer to clocksource
+ * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
+ * @cs: Pointer to clocksource to be updated
*
*/
-static u64 clocksource_max_deferment(struct clocksource *cs)
+static inline void clocksource_update_max_deferment(struct clocksource *cs)
{
- u64 max_nsecs;
-
- max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
- cs->mask);
- /*
- * To ensure that the clocksource does not wrap whilst we are idle,
- * limit the time the clocksource can be deferred by 12.5%. Please
- * note a margin of 12.5% is used because this can be computed with
- * a shift, versus say 10% which would require division.
- */
- return max_nsecs - (max_nsecs >> 3);
+ cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
+ cs->maxadj, cs->mask,
+ &cs->max_cycles);
}
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -724,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs)
}
/**
- * __clocksource_updatefreq_scale - Used update clocksource with new freq
+ * __clocksource_update_freq_scale - Used to update clocksource with new freq
* @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
@@ -732,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs)
* This should only be called from the clocksource->enable() method.
*
* This *SHOULD NOT* be called directly! Please use the
- * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
+ * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
+ * functions.
*/
-void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
u64 sec;
+
/*
- * Calc the maximum number of seconds which we can run before
- * wrapping around. For clocksources which have a mask > 32bit
- * we need to limit the max sleep time to have a good
- * conversion precision. 10 minutes is still a reasonable
- * amount. That results in a shift value of 24 for a
- * clocksource with mask >= 40bit and f >= 4GHz. That maps to
- * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
- * margin as we do in clocksource_max_deferment()
+ * Default clocksources are *special* and self-define their mult/shift.
+ * But, you're not special, so you should specify a freq value.
*/
- sec = (cs->mask - (cs->mask >> 3));
- do_div(sec, freq);
- do_div(sec, scale);
- if (!sec)
- sec = 1;
- else if (sec > 600 && cs->mask > UINT_MAX)
- sec = 600;
-
- clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
- NSEC_PER_SEC / scale, sec * scale);
-
+ if (freq) {
+ /*
+ * Calc the maximum number of seconds which we can run before
+ * wrapping around. For clocksources which have a mask > 32-bit
+ * we need to limit the max sleep time to have a good
+ * conversion precision. 10 minutes is still a reasonable
+ * amount. That results in a shift value of 24 for a
+ * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
+ * ~ 0.06ppm granularity for NTP.
+ */
+ sec = cs->mask;
+ do_div(sec, freq);
+ do_div(sec, scale);
+ if (!sec)
+ sec = 1;
+ else if (sec > 600 && cs->mask > UINT_MAX)
+ sec = 600;
+
+ clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+ NSEC_PER_SEC / scale, sec * scale);
+ }
/*
- * for clocksources that have large mults, to avoid overflow.
- * Since mult may be adjusted by ntp, add an safety extra margin
- *
+ * Ensure clocksources that have large 'mult' values don't overflow
+ * when adjusted.
*/
cs->maxadj = clocksource_max_adjustment(cs);
- while ((cs->mult + cs->maxadj < cs->mult)
- || (cs->mult - cs->maxadj > cs->mult)) {
+ while (freq && ((cs->mult + cs->maxadj < cs->mult)
+ || (cs->mult - cs->maxadj > cs->mult))) {
cs->mult >>= 1;
cs->shift--;
cs->maxadj = clocksource_max_adjustment(cs);
}
- cs->max_idle_ns = clocksource_max_deferment(cs);
+ /*
+ * Only warn for *special* clocksources that self-define
+ * their mult/shift values and don't specify a freq.
+ */
+ WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+ "timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
+ cs->name);
+
+ clocksource_update_max_deferment(cs);
+
+ pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+ cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
-EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
/**
* __clocksource_register_scale - Used to install new clocksources
@@ -790,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
/* Initialize mult/shift and max_idle_ns */
- __clocksource_updatefreq_scale(cs, scale, freq);
+ __clocksource_update_freq_scale(cs, scale, freq);
/* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
@@ -802,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
-
-/**
- * clocksource_register - Used to install new clocksources
- * @cs: clocksource to be registered
- *
- * Returns -EBUSY if registration fails, zero otherwise.
- */
-int clocksource_register(struct clocksource *cs)
-{
- /* calculate max adjustment for given mult/shift */
- cs->maxadj = clocksource_max_adjustment(cs);
- WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
- "Clocksource %s might overflow on 11%% adjustment\n",
- cs->name);
-
- /* calculate max idle time permitted for this clocksource */
- cs->max_idle_ns = clocksource_max_deferment(cs);
-
- mutex_lock(&clocksource_mutex);
- clocksource_enqueue(cs);
- clocksource_enqueue_watchdog(cs);
- clocksource_select();
- mutex_unlock(&clocksource_mutex);
- return 0;
-}
-EXPORT_SYMBOL(clocksource_register);
-
static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
list_del(&cs->list);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 37e50aadd471..76d4bd962b19 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,7 +54,7 @@
#include <trace/events/timer.h>
-#include "timekeeping.h"
+#include "tick-internal.h"
/*
* The timer bases:
@@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
boot = ktime_add(mono, off_boot);
xtim = ktime_add(mono, off_real);
- tai = ktime_add(xtim, off_tai);
+ tai = ktime_add(mono, off_tai);
base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
@@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
/*
* Divide a ktime value by a nanosecond value
*/
-u64 ktime_divns(const ktime_t kt, s64 div)
+u64 __ktime_divns(const ktime_t kt, s64 div)
{
u64 dclc;
int sft = 0;
@@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
return dclc;
}
-EXPORT_SYMBOL_GPL(ktime_divns);
+EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG >= 64 */
/*
@@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer)
trace_hrtimer_cancel(timer);
}
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
+ int i;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ struct timerqueue_node *next;
+ struct hrtimer *timer;
+
+ next = timerqueue_getnext(&base->active);
+ if (!next)
+ continue;
+
+ timer = container_of(next, struct hrtimer, node);
+ expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ if (expires.tv64 < expires_next.tv64)
+ expires_next = expires;
+ }
+ /*
+ * clock_was_set() might have changed base->offset of any of
+ * the clock bases so the result might be negative. Fix it up
+ * to prevent a false positive in clockevents_program_event().
+ */
+ if (expires_next.tv64 < 0)
+ expires_next.tv64 = 0;
+ return expires_next;
+}
+#endif
+
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void)
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
- int i;
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- ktime_t expires, expires_next;
-
- expires_next.tv64 = KTIME_MAX;
-
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
- struct hrtimer *timer;
- struct timerqueue_node *next;
-
- next = timerqueue_getnext(&base->active);
- if (!next)
- continue;
- timer = container_of(next, struct hrtimer, node);
-
- expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- /*
- * clock_was_set() has changed base->offset so the
- * result might be negative. Fix it up to prevent a
- * false positive in clockevents_program_event()
- */
- if (expires.tv64 < 0)
- expires.tv64 = 0;
- if (expires.tv64 < expires_next.tv64)
- expires_next = expires;
- }
+ ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
return;
@@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
return 0;
/*
+ * When the target cpu of the timer is currently executing
+ * hrtimer_interrupt(), then we do not touch the clock event
+ * device. hrtimer_interrupt() will reevaluate all clock bases
+ * before reprogramming the device.
+ */
+ if (cpu_base->in_hrtirq)
+ return 0;
+
+ /*
* If a hang was detected in the last timer interrupt then we
* do not schedule a timer which is earlier than the expiry
* which we enforced in the hang detection. We want the system
@@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
ktime_t hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- struct hrtimer_clock_base *base = cpu_base->clock_base;
- ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
+ ktime_t mindelta = { .tv64 = KTIME_MAX };
unsigned long flags;
- int i;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!hrtimer_hres_active()) {
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
- struct hrtimer *timer;
- struct timerqueue_node *next;
-
- next = timerqueue_getnext(&base->active);
- if (!next)
- continue;
-
- timer = container_of(next, struct hrtimer, node);
- delta.tv64 = hrtimer_get_expires_tv64(timer);
- delta = ktime_sub(delta, base->get_time());
- if (delta.tv64 < mindelta.tv64)
- mindelta.tv64 = delta.tv64;
- }
- }
+ if (!hrtimer_hres_active())
+ mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
+ ktime_get());
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
raw_spin_lock(&cpu_base->lock);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
- expires_next.tv64 = KTIME_MAX;
+ cpu_base->in_hrtirq = 1;
/*
* We set expires_next to KTIME_MAX here with cpu_base->lock
* held to prevent that a timer is enqueued in our queue via
@@ -1291,28 +1291,20 @@ retry:
* are right-of a not yet expired timer, because that
* timer will have to trigger a wakeup anyway.
*/
-
- if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
- ktime_t expires;
-
- expires = ktime_sub(hrtimer_get_expires(timer),
- base->offset);
- if (expires.tv64 < 0)
- expires.tv64 = KTIME_MAX;
- if (expires.tv64 < expires_next.tv64)
- expires_next = expires;
+ if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
break;
- }
__run_hrtimer(timer, &basenow);
}
}
-
+ /* Reevaluate the clock bases for the next expiry */
+ expires_next = __hrtimer_get_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
*/
cpu_base->expires_next = expires_next;
+ cpu_base->in_hrtirq = 0;
raw_spin_unlock(&cpu_base->lock);
/* Reprogramming necessary ? */
@@ -1591,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
goto out;
}
- restart = &current_thread_info()->restart_block;
+ restart = &current->restart_block;
restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.rmtp = rmtp;
@@ -1715,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
- break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- {
- clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
migrate_hrtimers(scpu);
break;
- }
#endif
default:
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a6a5bf53e86d..347fecf86a3f 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -25,7 +25,7 @@
#include <linux/module.h>
#include <linux/init.h>
-#include "tick-internal.h"
+#include "timekeeping.h"
/* The Jiffies based clocksource is the lowest common
* denominator clock source which should function on
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
};
__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
static int __init init_jiffies_clocksource(void)
{
- return clocksource_register(&clocksource_jiffies);
+ return __clocksource_register(&clocksource_jiffies);
}
core_initcall(init_jiffies_clocksource);
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
- clocksource_register(&refined_jiffies);
+ __clocksource_register(&refined_jiffies);
return 0;
}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 87a346fd6d61..7a681003001c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <linux/rtc.h>
-#include "tick-internal.h"
#include "ntp_internal.h"
/*
@@ -459,6 +458,16 @@ out:
return leap;
}
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock64(struct timespec64 now64)
+{
+ struct timespec now;
+
+ now = timespec64_to_timespec(now64);
+ return update_persistent_clock(now);
+}
+#endif
+
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_cmos_clock(struct work_struct *work);
@@ -488,14 +497,15 @@ static void sync_cmos_clock(struct work_struct *work)
getnstimeofday64(&now);
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
- struct timespec adjust = timespec64_to_timespec(now);
+ struct timespec64 adjust = now;
fail = -ENODEV;
if (persistent_clock_is_local)
adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
- fail = update_persistent_clock(adjust);
+ fail = update_persistent_clock64(adjust);
#endif
+
#ifdef CONFIG_RTC_SYSTOHC
if (fail == -ENODEV)
fail = rtc_set_ntp_time(adjust);
@@ -633,6 +643,17 @@ int ntp_validate_timex(struct timex *txc)
if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
return -EPERM;
+ /*
+ * Check for potential multiplication overflows that can
+ * only happen on 64-bit systems:
+ */
+ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
+ if (LLONG_MIN / PPM_SCALE > txc->freq)
+ return -EINVAL;
+ if (LLONG_MAX / PPM_SCALE < txc->freq)
+ return -EINVAL;
+ }
+
return 0;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b67859e2a..0075da74abf0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
struct timespec *rqtp, struct timespec __user *rmtp)
{
- struct restart_block *restart_block =
- &current_thread_info()->restart_block;
+ struct restart_block *restart_block = &current->restart_block;
struct itimerspec it;
int error;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a26036d37a38 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,5 +1,6 @@
/*
- * sched_clock.c: support for extending counters to full 64-bit ns counter
+ * sched_clock.c: Generic sched_clock() support, to extend low level
+ * hardware time counters to full 64-bit ns values.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -18,15 +19,53 @@
#include <linux/seqlock.h>
#include <linux/bitops.h>
-struct clock_data {
- ktime_t wrap_kt;
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns: sched_clock() value at last update
+ * @epoch_cyc: Clock cycle value at last update.
+ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
+ * clocks.
+ * @read_sched_clock: Current clock source (or dummy source when suspended).
+ * @mult: Multiplier for scaled math conversion.
+ * @shift: Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
u64 epoch_ns;
u64 epoch_cyc;
- seqcount_t seq;
- unsigned long rate;
+ u64 sched_clock_mask;
+ u64 (*read_sched_clock)(void);
u32 mult;
u32 shift;
- bool suspended;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ * registration of a new clock source)
+ *
+ * @seq: Sequence counter for protecting updates. The lowest
+ * bit is the index for @read_data.
+ * @read_data: Data required to read from sched_clock.
+ * @wrap_kt: Duration for which clock can run before wrapping.
+ * @rate: Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+ seqcount_t seq;
+ struct clock_read_data read_data[2];
+ ktime_t wrap_kt;
+ unsigned long rate;
+
+ u64 (*actual_read_sched_clock)(void);
};
static struct hrtimer sched_clock_timer;
@@ -34,12 +73,6 @@ static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
-static struct clock_data cd = {
- .mult = NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
static u64 notrace jiffy_sched_clock_read(void)
{
/*
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
return (u64)(jiffies - INITIAL_JIFFIES);
}
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd ____cacheline_aligned = {
+ .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
+ .read_sched_clock = jiffy_sched_clock_read, },
+ .actual_read_sched_clock = jiffy_sched_clock_read,
+};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void)
{
- u64 epoch_ns;
- u64 epoch_cyc;
- u64 cyc;
+ u64 cyc, res;
unsigned long seq;
-
- if (cd.suspended)
- return cd.epoch_ns;
+ struct clock_read_data *rd;
do {
- seq = raw_read_seqcount_begin(&cd.seq);
- epoch_cyc = cd.epoch_cyc;
- epoch_ns = cd.epoch_ns;
+ seq = raw_read_seqcount(&cd.seq);
+ rd = cd.read_data + (seq & 1);
+
+ cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
+ rd->sched_clock_mask;
+ res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
} while (read_seqcount_retry(&cd.seq, seq));
- cyc = read_sched_clock();
- cyc = (cyc - epoch_cyc) & sched_clock_mask;
- return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+ return res;
+}
+
+/*
+ * Updating the data required to read the clock.
+ *
+ * sched_clock() will never observe mis-matched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock() to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock() as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ */
+static void update_clock_read_data(struct clock_read_data *rd)
+{
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ /* steer readers towards the odd copy */
+ raw_write_seqcount_latch(&cd.seq);
+
+ /* now it's safe for us to update the normal (even) copy */
+ cd.read_data[0] = *rd;
+
+ /* switch readers back to the even copy */
+ raw_write_seqcount_latch(&cd.seq);
}
/*
- * Atomically update the sched_clock epoch.
+ * Atomically update the sched_clock() epoch.
*/
-static void notrace update_sched_clock(void)
+static void update_sched_clock(void)
{
- unsigned long flags;
u64 cyc;
u64 ns;
+ struct clock_read_data rd;
+
+ rd = cd.read_data[0];
+
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+
+ rd.epoch_ns = ns;
+ rd.epoch_cyc = cyc;
- cyc = read_sched_clock();
- ns = cd.epoch_ns +
- cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
- cd.mult, cd.shift);
-
- raw_local_irq_save(flags);
- raw_write_seqcount_begin(&cd.seq);
- cd.epoch_ns = ns;
- cd.epoch_cyc = cyc;
- raw_write_seqcount_end(&cd.seq);
- raw_local_irq_restore(flags);
+ update_clock_read_data(&rd);
}
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
{
update_sched_clock();
hrtimer_forward_now(hrt, cd.wrap_kt);
+
return HRTIMER_RESTART;
}
-void __init sched_clock_register(u64 (*read)(void), int bits,
- unsigned long rate)
+void __init
+sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
- ktime_t new_wrap_kt;
unsigned long r;
char r_unit;
+ struct clock_read_data rd;
if (cd.rate > rate)
return;
WARN_ON(!irqs_disabled());
- /* calculate the mult/shift to convert counter ticks to ns. */
+ /* Calculate the mult/shift to convert counter ticks to ns. */
clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
new_mask = CLOCKSOURCE_MASK(bits);
+ cd.rate = rate;
+
+ /* Calculate how many nanosecs until we risk wrapping */
+ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
+ cd.wrap_kt = ns_to_ktime(wrap);
- /* calculate how many ns until we wrap */
- wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
- new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+ rd = cd.read_data[0];
- /* update epoch for new counter and update epoch_ns from old counter*/
+ /* Update epoch for new counter and update 'epoch_ns' from old counter*/
new_epoch = read();
- cyc = read_sched_clock();
- ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
- cd.mult, cd.shift);
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+ cd.actual_read_sched_clock = read;
- raw_write_seqcount_begin(&cd.seq);
- read_sched_clock = read;
- sched_clock_mask = new_mask;
- cd.rate = rate;
- cd.wrap_kt = new_wrap_kt;
- cd.mult = new_mult;
- cd.shift = new_shift;
- cd.epoch_cyc = new_epoch;
- cd.epoch_ns = ns;
- raw_write_seqcount_end(&cd.seq);
+ rd.read_sched_clock = read;
+ rd.sched_clock_mask = new_mask;
+ rd.mult = new_mult;
+ rd.shift = new_shift;
+ rd.epoch_cyc = new_epoch;
+ rd.epoch_ns = ns;
+
+ update_clock_read_data(&rd);
r = rate;
if (r >= 4000000) {
r /= 1000000;
r_unit = 'M';
- } else if (r >= 1000) {
- r /= 1000;
- r_unit = 'k';
- } else
- r_unit = ' ';
-
- /* calculate the ns resolution of this counter */
+ } else {
+ if (r >= 1000) {
+ r /= 1000;
+ r_unit = 'k';
+ } else {
+ r_unit = ' ';
+ }
+ }
+
+ /* Calculate the ns resolution of this counter */
res = cyc_to_ns(1ULL, new_mult, new_shift);
pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
bits, r, r_unit, res, wrap);
- /* Enable IRQ time accounting if we have a fast enough sched_clock */
+ /* Enable IRQ time accounting if we have a fast enough sched_clock() */
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
void __init sched_clock_postinit(void)
{
/*
- * If no sched_clock function has been provided at that point,
+ * If no sched_clock() function has been provided at that point,
* make it the final one.
*/
- if (read_sched_clock == jiffy_sched_clock_read)
+ if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock();
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
}
+/*
+ * Clock read function for use when the clock is suspended.
+ *
+ * This function makes it appear to sched_clock() as if the clock
+ * stopped counting at its last update.
+ *
+ * This function must only be called from the critical
+ * section in sched_clock(). It relies on the read_seqcount_retry()
+ * at the end of the critical section to be sure we observe the
+ * correct copy of 'epoch_cyc'.
+ */
+static u64 notrace suspended_sched_clock_read(void)
+{
+ unsigned long seq = raw_read_seqcount(&cd.seq);
+
+ return cd.read_data[seq & 1].epoch_cyc;
+}
+
static int sched_clock_suspend(void)
{
+ struct clock_read_data *rd = &cd.read_data[0];
+
update_sched_clock();
hrtimer_cancel(&sched_clock_timer);
- cd.suspended = true;
+ rd->read_sched_clock = suspended_sched_clock_read;
+
return 0;
}
static void sched_clock_resume(void)
{
- cd.epoch_cyc = read_sched_clock();
+ struct clock_read_data *rd = &cd.read_data[0];
+
+ rd->epoch_cyc = cd.actual_read_sched_clock();
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
- cd.suspended = false;
+ rd->read_sched_clock = cd.actual_read_sched_clock;
}
static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+ .suspend = sched_clock_suspend,
+ .resume = sched_clock_resume,
};
static int __init sched_clock_syscore_init(void)
{
register_syscore_ops(&sched_clock_ops);
+
return 0;
}
device_initcall(sched_clock_syscore_init);
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index eb682d5c697c..6aac4beedbbe 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode,
*/
static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
{
+ int bc_moved;
/*
* We try to cancel the timer first. If the callback is on
* flight on some other cpu then we let it handle it. If we
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* restart the timer because we are in the callback, but we
* can set the expiry time and let the callback return
* HRTIMER_RESTART.
+ *
+ * Since we are in the idle loop at this point and because
+ * hrtimer_{start/cancel} functions call into tracing,
+ * calls to these functions must be bound within RCU_NONIDLE.
*/
- if (hrtimer_try_to_cancel(&bctimer) >= 0) {
- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+ RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
+ !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
+ 0);
+ if (bc_moved) {
/* Bind the "device" to the cpu */
bc->bound_on = smp_processor_id();
} else if (bc->bound_on == smp_processor_id()) {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 066f0ec05e48..7e8ca4f448a8 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask;
static cpumask_var_t tick_broadcast_on;
static cpumask_var_t tmpmask;
static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
-static int tick_broadcast_force;
+static int tick_broadcast_forced;
#ifdef CONFIG_TICK_ONESHOT
static void tick_broadcast_clear_oneshot(int cpu);
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
#else
static inline void tick_broadcast_clear_oneshot(int cpu) { }
+static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
#endif
/*
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
/*
* The device is in periodic mode. No reprogramming necessary:
*/
- if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+ if (dev->state == CLOCK_EVT_STATE_PERIODIC)
goto unlock;
/*
@@ -324,49 +326,54 @@ unlock:
raw_spin_unlock(&tick_broadcast_lock);
}
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_control - Enable/disable or force broadcast mode
+ * @mode: The selected broadcast mode
+ *
+ * Called when the system enters a state where affected tick devices
+ * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
*/
-static void tick_do_broadcast_on_off(unsigned long *reason)
+void tick_broadcast_control(enum tick_broadcast_mode mode)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags;
int cpu, bc_stopped;
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
- cpu = smp_processor_id();
- td = &per_cpu(tick_cpu_device, cpu);
+ td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
- bc = tick_broadcast_device.evtdev;
/*
* Is the device not affected by the powerstate ?
*/
if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
- goto out;
+ return;
if (!tick_device_is_functional(dev))
- goto out;
+ return;
+ raw_spin_lock(&tick_broadcast_lock);
+ cpu = smp_processor_id();
+ bc = tick_broadcast_device.evtdev;
bc_stopped = cpumask_empty(tick_broadcast_mask);
- switch (*reason) {
- case CLOCK_EVT_NOTIFY_BROADCAST_ON:
- case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ switch (mode) {
+ case TICK_BROADCAST_FORCE:
+ tick_broadcast_forced = 1;
+ case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
}
- if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
- tick_broadcast_force = 1;
break;
- case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- if (tick_broadcast_force)
+
+ case TICK_BROADCAST_OFF:
+ if (tick_broadcast_forced)
break;
cpumask_clear_cpu(cpu, tick_broadcast_on);
if (!tick_device_is_functional(dev))
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
else
tick_broadcast_setup_oneshot(bc);
}
-out:
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop.
- */
-void tick_broadcast_on_off(unsigned long reason, int *oncpu)
-{
- if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
- printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
- "offline CPU #%d\n", *oncpu);
- else
- tick_do_broadcast_on_off(&reason);
+ raw_spin_unlock(&tick_broadcast_lock);
}
+EXPORT_SYMBOL_GPL(tick_broadcast_control);
/*
* Set the periodic handler depending on broadcast on/off
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
dev->event_handler = tick_handle_periodic_broadcast;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Remove a CPU from broadcasting
*/
-void tick_shutdown_broadcast(unsigned int *cpup)
+void tick_shutdown_broadcast(unsigned int cpu)
{
struct clock_event_device *bc;
unsigned long flags;
- unsigned int cpu = *cpup;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#endif
void tick_suspend_broadcast(void)
{
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
-int tick_resume_broadcast(void)
+/*
+ * This is called from tick_resume_local() on a resuming CPU. That's
+ * called from the core resume function, tick_unfreeze() and the magic XEN
+ * resume hackery.
+ *
+ * In none of these cases the broadcast device mode can change and the
+ * bit of the resuming CPU in the broadcast mask is safe as well.
+ */
+bool tick_resume_check_broadcast(void)
+{
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
+ return false;
+ else
+ return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
+}
+
+void tick_resume_broadcast(void)
{
struct clock_event_device *bc;
unsigned long flags;
- int broadcast = 0;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
bc = tick_broadcast_device.evtdev;
if (bc) {
- clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+ clockevents_tick_resume(bc);
switch (tick_broadcast_device.mode) {
case TICKDEV_MODE_PERIODIC:
if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(bc);
- broadcast = cpumask_test_cpu(smp_processor_id(),
- tick_broadcast_mask);
break;
case TICKDEV_MODE_ONESHOT:
if (!cpumask_empty(tick_broadcast_mask))
- broadcast = tick_resume_broadcast_oneshot(bc);
+ tick_resume_broadcast_oneshot(bc);
break;
}
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-
- return broadcast;
}
-
#ifdef CONFIG_TICK_ONESHOT
static cpumask_var_t tick_broadcast_oneshot_mask;
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
{
int ret;
- if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ if (bc->state != CLOCK_EVT_STATE_ONESHOT)
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
ret = clockevents_program_event(bc, expires, force);
if (!ret)
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
return ret;
}
-int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
- return 0;
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
}
/*
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void)
* switched over, leave the device alone.
*/
if (td->mode == TICKDEV_MODE_ONESHOT) {
- clockevents_set_mode(td->evtdev,
- CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(td->evtdev,
+ CLOCK_EVT_STATE_ONESHOT);
}
}
}
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
if (dev->next_event.tv64 < bc->next_event.tv64)
return;
}
- clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}
-static void broadcast_move_bc(int deadcpu)
-{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
-
- if (!bc || !broadcast_needs_cpu(bc, deadcpu))
- return;
- /* This moves the broadcast assignment to this cpu */
- clockevents_program_event(bc, bc->next_event, 1);
-}
-
-/*
- * Powerstate information: The system enters/leaves a state, where
- * affected devices might stop
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state: The target state (enter/exit)
+ *
+ * The system enters/leaves a state, where affected devices might stop
* Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
*/
-int tick_broadcast_oneshot_control(unsigned long reason)
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
{
struct clock_event_device *bc, *dev;
struct tick_device *td;
- unsigned long flags;
- ktime_t now;
int cpu, ret = 0;
+ ktime_t now;
/*
* Periodic mode does not care about the enter/exit of power
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason)
* We are called with preemption disabled from the depth of the
* idle code, so we can't be moved away.
*/
- cpu = smp_processor_id();
- td = &per_cpu(tick_cpu_device, cpu);
+ td = this_cpu_ptr(&tick_cpu_device);
dev = td->evtdev;
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;
+ raw_spin_lock(&tick_broadcast_lock);
bc = tick_broadcast_device.evtdev;
+ cpu = smp_processor_id();
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
- if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+ if (state == TICK_BROADCAST_ENTER) {
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
broadcast_shutdown_local(bc, dev);
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
/*
* The cpu which was handling the broadcast
* timer marked this cpu in the broadcast
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason)
}
}
out:
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ raw_spin_unlock(&tick_broadcast_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
/*
* Reset the one shot broadcast for a cpu
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+ int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
bc->event_handler = tick_handle_oneshot_broadcast;
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_oneshot_mask, tmpmask);
if (was_periodic && !cpumask_empty(tmpmask)) {
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_init_next_event(tmpmask,
tick_next_period);
tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void)
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#ifdef CONFIG_HOTPLUG_CPU
+void hotplug_cpu__broadcast_tick_pull(int deadcpu)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ bc = tick_broadcast_device.evtdev;
+
+ if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /* This moves the broadcast assignment to this CPU: */
+ clockevents_program_event(bc, bc->next_event, 1);
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
/*
* Remove a dead CPU from broadcasting
*/
-void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
+void tick_shutdown_broadcast_oneshot(unsigned int cpu)
{
unsigned long flags;
- unsigned int cpu = *cpup;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
- broadcast_move_bc(cpu);
-
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+#endif
/*
* Check, whether the broadcast device is in one shot mode
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf53ebd..3ae6afa1eb98 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
tick_periodic(cpu);
- if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+ if (dev->state != CLOCK_EVT_STATE_ONESHOT)
return;
for (;;) {
/*
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) {
- clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
unsigned long seq;
ktime_t next;
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
next = tick_next_period;
} while (read_seqretry(&jiffies_lock, seq));
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
for (;;) {
if (!clockevents_program_event(dev, next, false))
@@ -332,14 +332,16 @@ out_bc:
tick_install_broadcast_device(newdev);
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Transfer the do_timer job away from a dying cpu.
*
- * Called with interrupts disabled.
+ * Called with interrupts disabled. No locking required. If
+ * tick_do_timer_cpu is owned by this cpu, nothing can change it.
*/
-void tick_handover_do_timer(int *cpup)
+void tick_handover_do_timer(void)
{
- if (*cpup == tick_do_timer_cpu) {
+ if (tick_do_timer_cpu == smp_processor_id()) {
int cpu = cpumask_first(cpu_online_mask);
tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup)
* access the hardware device itself.
* We just set the mode and remove it from the lists.
*/
-void tick_shutdown(unsigned int *cpup)
+void tick_shutdown(unsigned int cpu)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
+ struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup)
* Prevent that the clock events layer tries to call
* the set mode function!
*/
+ dev->state = CLOCK_EVT_STATE_DETACHED;
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
}
}
+#endif
-void tick_suspend(void)
+/**
+ * tick_suspend_local - Suspend the local tick device
+ *
+ * Called from the local cpu for freeze with interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend_local(void)
{
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
clockevents_shutdown(td->evtdev);
}
-void tick_resume(void)
+/**
+ * tick_resume_local - Resume the local tick device
+ *
+ * Called from the local CPU for unfreeze or XEN resume magic.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume_local(void)
{
struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
- int broadcast = tick_resume_broadcast();
-
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
+ bool broadcast = tick_resume_check_broadcast();
+ clockevents_tick_resume(td->evtdev);
if (!broadcast) {
if (td->mode == TICKDEV_MODE_PERIODIC)
tick_setup_periodic(td->evtdev, 0);
@@ -395,6 +412,83 @@ void tick_resume(void)
}
/**
+ * tick_suspend - Suspend the tick and the broadcast device
+ *
+ * Called from syscore_suspend() via timekeeping_suspend with only one
+ * CPU online and interrupts disabled, or from tick_freeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend(void)
+{
+ tick_suspend_local();
+ tick_suspend_broadcast();
+}
+
+/**
+ * tick_resume - Resume the tick and the broadcast device
+ *
+ * Called from syscore_resume() via timekeeping_resume with only one
+ * CPU online and interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume(void)
+{
+ tick_resume_broadcast();
+ tick_resume_local();
+}
+
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping. Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ tick_freeze_depth++;
+ if (tick_freeze_depth == num_online_cpus())
+ timekeeping_suspend();
+ else
+ tick_suspend_local();
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping. Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ if (tick_freeze_depth == num_online_cpus())
+ timekeeping_resume();
+ else
+ tick_resume_local();
+
+ tick_freeze_depth--;
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
* tick_init - initialize the tick control
*/
void __init tick_init(void)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 366aeb4f2c66..b64fdd8054c5 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,15 +5,12 @@
#include <linux/tick.h>
#include "timekeeping.h"
+#include "tick-sched.h"
-extern seqlock_t jiffies_lock;
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
-#define CS_NAME_LEN 32
-
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
-
-#define TICK_DO_TIMER_NONE -1
-#define TICK_DO_TIMER_BOOT -2
+# define TICK_DO_TIMER_NONE -1
+# define TICK_DO_TIMER_BOOT -2
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern ktime_t tick_next_period;
@@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_handover_do_timer(int *cpup);
-extern void tick_shutdown(unsigned int *cpup);
+extern void tick_shutdown(unsigned int cpu);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
struct clock_event_device *newdev);
extern void tick_install_replacement(struct clock_event_device *dev);
+extern int tick_is_oneshot_available(void);
+extern struct tick_device *tick_get_device(int cpu);
-extern void clockevents_shutdown(struct clock_event_device *dev);
+extern int clockevents_tick_resume(struct clock_event_device *dev);
+/* Check, if the device is functional or a dummy for broadcast */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+ return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
+extern void clockevents_shutdown(struct clock_event_device *dev);
+extern void clockevents_exchange_device(struct clock_event_device *old,
+ struct clock_event_device *new);
+extern void clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state);
+extern int clockevents_program_event(struct clock_event_device *dev,
+ ktime_t expires, bool force);
+extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
-/*
- * NO_HZ / high resolution timer shared code
- */
+/* Broadcasting support */
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern void tick_install_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_suspend_broadcast(void);
+extern void tick_resume_broadcast(void);
+extern bool tick_resume_check_broadcast(void);
+extern void tick_broadcast_init(void);
+extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
+extern struct tick_device *tick_get_broadcast_device(void);
+extern struct cpumask *tick_get_broadcast_mask(void);
+# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
+static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
+static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_suspend_broadcast(void) { }
+static inline void tick_resume_broadcast(void) { }
+static inline bool tick_resume_check_broadcast(void) { return false; }
+static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
+
+/* Set the periodic handler in non broadcast mode */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+ dev->event_handler = tick_handle_periodic;
+}
+# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
+
+#else /* !GENERIC_CLOCKEVENTS: */
+static inline void tick_suspend(void) { }
+static inline void tick_resume(void) { }
+#endif /* !GENERIC_CLOCKEVENTS */
+
+/* Oneshot related functions */
#ifdef CONFIG_TICK_ONESHOT
extern void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
@@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force);
extern void tick_oneshot_notify(void);
extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
extern void tick_resume_oneshot(void);
-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+static inline bool tick_oneshot_possible(void) { return true; }
+extern int tick_oneshot_mode_active(void);
+extern void tick_clock_notify(void);
+extern int tick_check_oneshot_change(int allow_nohz);
+extern int tick_init_highres(void);
+#else /* !CONFIG_TICK_ONESHOT: */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt) { BUG(); }
+static inline void tick_resume_oneshot(void) { BUG(); }
+static inline int tick_program_event(ktime_t expires, int force) { return 0; }
+static inline void tick_oneshot_notify(void) { }
+static inline bool tick_oneshot_possible(void) { return false; }
+static inline int tick_oneshot_mode_active(void) { return 0; }
+static inline void tick_clock_notify(void) { }
+static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+#endif /* !CONFIG_TICK_ONESHOT */
+
+/* Functions related to oneshot broadcasting */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
-extern int tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
-extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
-extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast_this_cpu(void);
bool tick_broadcast_oneshot_available(void);
-# else /* BROADCAST */
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
- BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
+extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
+#else /* !(BROADCAST && ONESHOT): */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
static inline void tick_broadcast_switch_to_oneshot(void) { }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
-static inline bool tick_broadcast_oneshot_available(void) { return true; }
-# endif /* !BROADCAST */
-
-#else /* !ONESHOT */
-static inline
-void tick_setup_oneshot(struct clock_event_device *newdev,
- void (*handler)(struct clock_event_device *),
- ktime_t nextevt)
-{
- BUG();
-}
-static inline void tick_resume_oneshot(void)
-{
- BUG();
-}
-static inline int tick_program_event(ktime_t expires, int force)
-{
- return 0;
-}
-static inline void tick_oneshot_notify(void) { }
-static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
-{
- BUG();
-}
-static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
-static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
-{
- return 0;
-}
-static inline int tick_broadcast_oneshot_active(void) { return 0; }
-static inline bool tick_broadcast_oneshot_available(void) { return false; }
-#endif /* !TICK_ONESHOT */
+static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
+#endif /* !(BROADCAST && ONESHOT) */
/* NO_HZ_FULL internal */
#ifdef CONFIG_NO_HZ_FULL
@@ -105,68 +137,3 @@ extern void tick_nohz_init(void);
# else
static inline void tick_nohz_init(void) { }
#endif
-
-/*
- * Broadcasting support
- */
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern void tick_install_broadcast_device(struct clock_event_device *dev);
-extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
-extern void tick_shutdown_broadcast(unsigned int *cpup);
-extern void tick_suspend_broadcast(void);
-extern int tick_resume_broadcast(void);
-extern void tick_broadcast_init(void);
-extern void
-tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
-int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
-
-#else /* !BROADCAST */
-
-static inline void tick_install_broadcast_device(struct clock_event_device *dev)
-{
-}
-
-static inline int tick_is_broadcast_device(struct clock_event_device *dev)
-{
- return 0;
-}
-static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
- int cpu)
-{
- return 0;
-}
-static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
-static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
-static inline void tick_suspend_broadcast(void) { }
-static inline int tick_resume_broadcast(void) { return 0; }
-static inline void tick_broadcast_init(void) { }
-static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
- u32 freq) { return -ENODEV; }
-
-/*
- * Set the periodic handler in non broadcast mode
- */
-static inline void tick_set_periodic_handler(struct clock_event_device *dev,
- int broadcast)
-{
- dev->event_handler = tick_handle_periodic;
-}
-#endif /* !BROADCAST */
-
-/*
- * Check, if the device is functional or a dummy for broadcast
- */
-static inline int tick_device_is_functional(struct clock_event_device *dev)
-{
- return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
-}
-
-int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-
-#endif
-
-extern void do_timer(unsigned long ticks);
-extern void update_wall_time(void);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 7ce740e78e1b..67a64b1670bf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(dev, ktime_get(), true);
}
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
ktime_t next_event)
{
newdev->event_handler = handler;
- clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(newdev, next_event, true);
}
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
td->mode = TICKDEV_MODE_ONESHOT;
dev->event_handler = handler;
- clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_switch_to_oneshot();
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1363d58f07e9..914259128145 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -34,7 +34,7 @@
/*
* Per cpu nohz control structure
*/
-DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
/*
* The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -326,13 +326,6 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-/*
- * Worst case string length in chunks of CPU range seems 2 steps
- * separations: 0,2,4,6,...
- * This is NR_CPUS + sizeof('\0')
- */
-static char __initdata nohz_full_buf[NR_CPUS + 1];
-
static int tick_nohz_init_all(void)
{
int err = -1;
@@ -393,8 +386,8 @@ void __init tick_nohz_init(void)
context_tracking_cpu_set(cpu);
cpu_notifier(tick_nohz_cpu_down_callback, 0);
- cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
- pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
+ pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
+ cpumask_pr_args(tick_nohz_full_mask));
}
#endif
@@ -423,6 +416,11 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);
+int tick_nohz_tick_stopped(void)
+{
+ return __this_cpu_read(tick_cpu_sched.tick_stopped);
+}
+
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
*
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
new file mode 100644
index 000000000000..28b5da3e1a17
--- /dev/null
+++ b/kernel/time/tick-sched.h
@@ -0,0 +1,74 @@
+#ifndef _TICK_SCHED_H
+#define _TICK_SCHED_H
+
+#include <linux/hrtimer.h>
+
+enum tick_device_mode {
+ TICKDEV_MODE_PERIODIC,
+ TICKDEV_MODE_ONESHOT,
+};
+
+struct tick_device {
+ struct clock_event_device *evtdev;
+ enum tick_device_mode mode;
+};
+
+enum tick_nohz_mode {
+ NOHZ_MODE_INACTIVE,
+ NOHZ_MODE_LOWRES,
+ NOHZ_MODE_HIGHRES,
+};
+
+/**
+ * struct tick_sched - sched tick emulation and no idle tick control/stats
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
+ * @last_tick: Store the last tick expiry time when the tick
+ * timer is modified for nohz sleeps. This is necessary
+ * to resume the tick timer operation in the timeline
+ * when the CPU returns from nohz sleep.
+ * @tick_stopped: Indicator that the idle tick has been stopped
+ * @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_calls: Total number of idle calls
+ * @idle_sleeps: Number of idle calls, where the sched tick was stopped
+ * @idle_entrytime: Time when the idle call was entered
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_exittime: Time when the idle state was left
+ * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
+ * @sleep_length: Duration of the current idle sleep
+ * @do_timer_last: CPU was the last one doing do_timer before going idle
+ */
+struct tick_sched {
+ struct hrtimer sched_timer;
+ unsigned long check_clocks;
+ enum tick_nohz_mode nohz_mode;
+ ktime_t last_tick;
+ int inidle;
+ int tick_stopped;
+ unsigned long idle_jiffies;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+ int idle_active;
+ ktime_t idle_entrytime;
+ ktime_t idle_waketime;
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+ ktime_t sleep_length;
+ unsigned long last_jiffies;
+ unsigned long next_jiffies;
+ ktime_t idle_expires;
+ int do_timer_last;
+};
+
+extern struct tick_sched *tick_get_tick_sched(int cpu);
+
+extern void tick_setup_sched_timer(void);
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
+extern void tick_cancel_sched_timer(int cpu);
+#else
+static inline void tick_cancel_sched_timer(int cpu) { }
+#endif
+
+#endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 6390517e77d4..2c85b7724af4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
if (tv) {
if (copy_from_user(&user_tv, tv, sizeof(*tv)))
return -EFAULT;
+
+ if (!timeval_valid(&user_tv))
+ return -EINVAL;
+
new_ts.tv_sec = user_tv.tv_sec;
new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
new file mode 100644
index 000000000000..4687b3104bae
--- /dev/null
+++ b/kernel/time/timecounter.c
@@ -0,0 +1,112 @@
+/*
+ * linux/kernel/time/timecounter.c
+ *
+ * based on code that migrated away from
+ * linux/kernel/time/clocksource.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/export.h>
+#include <linux/timecounter.h>
+
+void timecounter_init(struct timecounter *tc,
+ const struct cyclecounter *cc,
+ u64 start_tstamp)
+{
+ tc->cc = cc;
+ tc->cycle_last = cc->read(cc);
+ tc->nsec = start_tstamp;
+ tc->mask = (1ULL << cc->shift) - 1;
+ tc->frac = 0;
+}
+EXPORT_SYMBOL_GPL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc: Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+ cycle_t cycle_now, cycle_delta;
+ u64 ns_offset;
+
+ /* read cycle counter: */
+ cycle_now = tc->cc->read(tc->cc);
+
+ /* calculate the delta since the last timecounter_read_delta(): */
+ cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta,
+ tc->mask, &tc->frac);
+
+ /* update time stamp of timecounter_read_delta() call: */
+ tc->cycle_last = cycle_now;
+
+ return ns_offset;
+}
+
+u64 timecounter_read(struct timecounter *tc)
+{
+ u64 nsec;
+
+ /* increment time by nanoseconds since last call */
+ nsec = timecounter_read_delta(tc);
+ nsec += tc->nsec;
+ tc->nsec = nsec;
+
+ return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_read);
+
+/*
+ * This is like cyclecounter_cyc2ns(), but it is used for computing a
+ * time previous to the time stored in the cycle counter.
+ */
+static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
+ cycle_t cycles, u64 mask, u64 frac)
+{
+ u64 ns = (u64) cycles;
+
+ ns = ((ns * cc->mult) - frac) >> cc->shift;
+
+ return ns;
+}
+
+u64 timecounter_cyc2time(struct timecounter *tc,
+ cycle_t cycle_tstamp)
+{
+ u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
+ u64 nsec = tc->nsec, frac = tc->frac;
+
+ /*
+ * Instead of always treating cycle_tstamp as more recent
+ * than tc->cycle_last, detect when it is too far in the
+ * future and treat it as old time stamp instead.
+ */
+ if (delta > tc->cc->mask / 2) {
+ delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
+ nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
+ } else {
+ nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
+ }
+
+ return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6a931852082f..946acb72179f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -59,17 +59,15 @@ struct tk_fast {
};
static struct tk_fast tk_fast_mono ____cacheline_aligned;
+static struct tk_fast tk_fast_raw ____cacheline_aligned;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
-/* Flag for if there is a persistent clock on this platform */
-bool __read_mostly persistent_clock_exist = false;
-
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
- while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
- tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
+ while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
+ tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
}
@@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
struct timespec64 ts;
ts.tv_sec = tk->xtime_sec;
- ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
return ts;
}
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
- tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec += ts->tv_sec;
- tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
}
@@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
+#ifdef CONFIG_DEBUG_TIMEKEEPING
+#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
+/*
+ * These simple flag variables are managed
+ * without locks, which is racy, but ok since
+ * we don't really care about being super
+ * precise about how many events were seen,
+ * just that a problem was observed.
+ */
+static int timekeeping_underflow_seen;
+static int timekeeping_overflow_seen;
+
+/* last_warning is only modified under the timekeeping lock */
+static long timekeeping_last_warning;
+
+static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+
+ cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
+ const char *name = tk->tkr_mono.clock->name;
+
+ if (offset > max_cycles) {
+ printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
+ offset, name, max_cycles);
+ printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
+ } else {
+ if (offset > (max_cycles >> 1)) {
+ printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n",
+ offset, name, max_cycles >> 1);
+ printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
+ }
+ }
+
+ if (timekeeping_underflow_seen) {
+ if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
+ printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
+ printk_deferred(" Your kernel is probably still fine.\n");
+ timekeeping_last_warning = jiffies;
+ }
+ timekeeping_underflow_seen = 0;
+ }
+
+ if (timekeeping_overflow_seen) {
+ if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
+ printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
+ printk_deferred(" Your kernel is probably still fine.\n");
+ timekeeping_last_warning = jiffies;
+ }
+ timekeeping_overflow_seen = 0;
+ }
+}
+
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+ cycle_t now, last, mask, max, delta;
+ unsigned int seq;
+
+ /*
+ * Since we're called holding a seqlock, the data may shift
+ * under us while we're doing the calculation. This can cause
+ * false positives, since we'd note a problem but throw the
+ * results away. So nest another seqlock here to atomically
+ * grab the points we are checking with.
+ */
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ now = tkr->read(tkr->clock);
+ last = tkr->cycle_last;
+ mask = tkr->mask;
+ max = tkr->clock->max_cycles;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ delta = clocksource_delta(now, last, mask);
+
+ /*
+ * Try to catch underflows by checking if we are seeing small
+ * mask-relative negative values.
+ */
+ if (unlikely((~delta & mask) < (mask >> 3))) {
+ timekeeping_underflow_seen = 1;
+ delta = 0;
+ }
+
+ /* Cap delta value to the max_cycles values to avoid mult overflows */
+ if (unlikely(delta > max)) {
+ timekeeping_overflow_seen = 1;
+ delta = tkr->clock->max_cycles;
+ }
+
+ return delta;
+}
+#else
+static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+}
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+ cycle_t cycle_now, delta;
+
+ /* read clocksource */
+ cycle_now = tkr->read(tkr->clock);
+
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+
+ return delta;
+}
+#endif
+
/**
* tk_setup_internals - Set up internals to use clocksource clock.
*
@@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
- old_clock = tk->tkr.clock;
- tk->tkr.clock = clock;
- tk->tkr.read = clock->read;
- tk->tkr.mask = clock->mask;
- tk->tkr.cycle_last = tk->tkr.read(clock);
+ old_clock = tk->tkr_mono.clock;
+ tk->tkr_mono.clock = clock;
+ tk->tkr_mono.read = clock->read;
+ tk->tkr_mono.mask = clock->mask;
+ tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+
+ tk->tkr_raw.clock = clock;
+ tk->tkr_raw.read = clock->read;
+ tk->tkr_raw.mask = clock->mask;
+ tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
if (shift_change < 0)
- tk->tkr.xtime_nsec >>= -shift_change;
+ tk->tkr_mono.xtime_nsec >>= -shift_change;
else
- tk->tkr.xtime_nsec <<= shift_change;
+ tk->tkr_mono.xtime_nsec <<= shift_change;
}
- tk->tkr.shift = clock->shift;
+ tk->tkr_raw.xtime_nsec = 0;
+
+ tk->tkr_mono.shift = clock->shift;
+ tk->tkr_raw.shift = clock->shift;
tk->ntp_error = 0;
tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
@@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
- tk->tkr.mult = clock->mult;
+ tk->tkr_mono.mult = clock->mult;
+ tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
}
@@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; }
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
- cycle_t cycle_now, delta;
+ cycle_t delta;
s64 nsec;
- /* read clocksource: */
- cycle_now = tkr->read(tkr->clock);
-
- /* calculate the delta since the last update_wall_time: */
- delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+ delta = timekeeping_get_delta(tkr);
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
@@ -209,30 +323,9 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
return nsec + arch_gettimeoffset();
}
-static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
-{
- struct clocksource *clock = tk->tkr.clock;
- cycle_t cycle_now, delta;
- s64 nsec;
-
- /* read clocksource: */
- cycle_now = tk->tkr.read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
-
- /* convert delta to nanoseconds. */
- nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
-
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + arch_gettimeoffset();
-}
-
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
- * @tk: The timekeeper from which we take the update
- * @tkf: The fast timekeeper to update
- * @tbase: The time base for the fast timekeeper (mono/raw)
+ * @tkr: Timekeeping readout base from which we take the update
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
@@ -244,11 +337,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
* smp_wmb(); <- Ensure that the last base[1] update is visible
* tkf->seq++;
* smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[0], tk);
+ * update(tkf->base[0], tkr);
* smp_wmb(); <- Ensure that the base[0] update is visible
* tkf->seq++;
* smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[1], tk);
+ * update(tkf->base[1], tkr);
*
* The reader side does:
*
@@ -269,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
* slightly wrong timestamp (a few nanoseconds). See
* @ktime_get_mono_fast_ns.
*/
-static void update_fast_timekeeper(struct timekeeper *tk)
+static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
{
- struct tk_read_base *base = tk_fast_mono.base;
+ struct tk_read_base *base = tkf->base;
/* Force readers off to base[1] */
- raw_write_seqcount_latch(&tk_fast_mono.seq);
+ raw_write_seqcount_latch(&tkf->seq);
/* Update base[0] */
- memcpy(base, &tk->tkr, sizeof(*base));
+ memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
- raw_write_seqcount_latch(&tk_fast_mono.seq);
+ raw_write_seqcount_latch(&tkf->seq);
/* Update base[1] */
memcpy(base + 1, base, sizeof(*base));
@@ -318,22 +411,67 @@ static void update_fast_timekeeper(struct timekeeper *tk)
* of the following timestamps. Callers need to be aware of that and
* deal with it.
*/
-u64 notrace ktime_get_mono_fast_ns(void)
+static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
struct tk_read_base *tkr;
unsigned int seq;
u64 now;
do {
- seq = raw_read_seqcount(&tk_fast_mono.seq);
- tkr = tk_fast_mono.base + (seq & 0x01);
- now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+ seq = raw_read_seqcount(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ } while (read_seqcount_retry(&tkf->seq, seq));
- } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
return now;
}
+
+u64 ktime_get_mono_fast_ns(void)
+{
+ return __ktime_get_fast_ns(&tk_fast_mono);
+}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+u64 ktime_get_raw_fast_ns(void)
+{
+ return __ktime_get_fast_ns(&tk_fast_raw);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+
+/* Suspend-time cycles value for halted fast timekeeper. */
+static cycle_t cycles_at_suspend;
+
+static cycle_t dummy_clock_read(struct clocksource *cs)
+{
+ return cycles_at_suspend;
+}
+
+/**
+ * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
+ * @tk: Timekeeper to snapshot.
+ *
+ * It generally is unsafe to access the clocksource after timekeeping has been
+ * suspended, so take a snapshot of the readout base of @tk and use it as the
+ * fast timekeeper's readout base while suspended. It will return the same
+ * number of cycles every time until timekeeping is resumed at which time the
+ * proper readout base for the fast timekeeper will be restored automatically.
+ */
+static void halt_fast_timekeeper(struct timekeeper *tk)
+{
+ static struct tk_read_base tkr_dummy;
+ struct tk_read_base *tkr = &tk->tkr_mono;
+
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ cycles_at_suspend = tkr->read(tkr->clock);
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
+
+ tkr = &tk->tkr_raw;
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
+}
+
#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
static inline void update_vsyscall(struct timekeeper *tk)
@@ -342,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk)
xt = timespec64_to_timespec(tk_xtime(tk));
wm = timespec64_to_timespec(tk->wall_to_monotonic);
- update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
- tk->tkr.cycle_last);
+ update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
+ tk->tkr_mono.cycle_last);
}
static inline void old_vsyscall_fixup(struct timekeeper *tk)
@@ -360,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
* users are removed, this can be killed.
*/
- remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
- tk->tkr.xtime_nsec -= remainder;
- tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
+ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
+ tk->tkr_mono.xtime_nsec -= remainder;
+ tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
}
#else
#define old_vsyscall_fixup(tk)
@@ -429,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
*/
seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
nsec = (u32) tk->wall_to_monotonic.tv_nsec;
- tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+ tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
/* Update the monotonic raw base */
- tk->base_raw = timespec64_to_ktime(tk->raw_time);
+ tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater/equal one second. Take
* this into account before updating tk->ktime_sec.
*/
- nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
if (nsec >= NSEC_PER_SEC)
seconds++;
tk->ktime_sec = seconds;
@@ -462,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
- update_fast_timekeeper(tk);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
}
/**
@@ -474,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr.clock;
+ struct clocksource *clock = tk->tkr_mono.clock;
cycle_t cycle_now, delta;
s64 nsec;
- cycle_now = tk->tkr.read(clock);
- delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
- tk->tkr.cycle_last = cycle_now;
+ cycle_now = tk->tkr_mono.read(clock);
+ delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
- tk->tkr.xtime_nsec += delta * tk->tkr.mult;
+ tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
/* If arch requires, add in get_arch_timeoffset() */
- tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
+ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
+ nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
timespec64_add_ns(&tk->raw_time, nsec);
}
@@ -510,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts)
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -550,8 +690,8 @@ ktime_t ktime_get(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->tkr.base_mono;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -576,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = ktime_add(tk->tkr.base_mono, *offset);
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = ktime_add(tk->tkr_mono.base, *offset);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -618,8 +758,8 @@ ktime_t ktime_get_raw(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->base_raw;
- nsecs = timekeeping_get_ns_raw(tk);
+ base = tk->tkr_raw.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -647,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(&tk->tkr);
+ nsec = timekeeping_get_ns(&tk->tkr_mono);
tomono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -732,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
- nsecs_raw = timekeeping_get_ns_raw(tk);
- nsecs_real = timekeeping_get_ns(&tk->tkr);
+ nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
+ nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -916,7 +1056,7 @@ static int change_clocksource(void *data)
*/
if (try_module_get(new->owner)) {
if (!new->enable || new->enable(new) == 0) {
- old = tk->tkr.clock;
+ old = tk->tkr_mono.clock;
tk_setup_internals(tk, new);
if (old->disable)
old->disable(old);
@@ -944,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock)
{
struct timekeeper *tk = &tk_core.timekeeper;
- if (tk->tkr.clock == clock)
+ if (tk->tkr_mono.clock == clock)
return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
- return tk->tkr.clock == clock ? 0 : -1;
+ return tk->tkr_mono.clock == clock ? 0 : -1;
}
/**
@@ -966,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- nsecs = timekeeping_get_ns_raw(tk);
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
ts64 = tk->raw_time;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -989,7 +1129,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1008,7 +1148,7 @@ u64 timekeeping_max_deferment(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->tkr.clock->max_idle_ns;
+ ret = tk->tkr_mono.clock->max_idle_ns;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -1030,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts)
ts->tv_nsec = 0;
}
+void __weak read_persistent_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_persistent_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
/**
* read_boot_clock - Return time of the system start.
*
@@ -1045,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts)
ts->tv_nsec = 0;
}
+void __weak read_boot_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_boot_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
+/* Flag indicating whether timekeeping_resume() has injected sleeptime */
+static bool sleeptime_injected;
+
+/* Flag indicating whether this platform has a persistent clock */
+static bool persistent_clock_exists;
+
/*
* timekeeping_init - Initializes the clocksource and common timekeeping values
*/
@@ -1054,20 +1216,17 @@ void __init timekeeping_init(void)
struct clocksource *clock;
unsigned long flags;
struct timespec64 now, boot, tmp;
- struct timespec ts;
- read_persistent_clock(&ts);
- now = timespec_to_timespec64(ts);
+ read_persistent_clock64(&now);
if (!timespec64_valid_strict(&now)) {
pr_warn("WARNING: Persistent clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
now.tv_sec = 0;
now.tv_nsec = 0;
} else if (now.tv_sec || now.tv_nsec)
- persistent_clock_exist = true;
+ persistent_clock_exists = true;
- read_boot_clock(&ts);
- boot = timespec_to_timespec64(ts);
+ read_boot_clock64(&boot);
if (!timespec64_valid_strict(&boot)) {
pr_warn("WARNING: Boot clock returned invalid value!\n"
" Check your CMOS/BIOS settings.\n");
@@ -1087,7 +1246,6 @@ void __init timekeeping_init(void)
tk_set_xtime(tk, &now);
tk->raw_time.tv_sec = 0;
tk->raw_time.tv_nsec = 0;
- tk->base_raw.tv64 = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
@@ -1100,7 +1258,7 @@ void __init timekeeping_init(void)
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
-/* time in seconds when suspend began */
+/* time in seconds when suspend began for persistent clock */
static struct timespec64 timekeeping_suspend_time;
/**
@@ -1125,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
tk_debug_account_sleep_time(delta);
}
+#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
+/**
+ * We have three kinds of time sources to use for sleep time
+ * injection, the preference order is:
+ * 1) non-stop clocksource
+ * 2) persistent clock (ie: RTC accessible when irqs are off)
+ * 3) RTC
+ *
+ * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
+ * If the system has neither 1) nor 2), 3) is used as the last resort.
+ *
+ *
+ * If timekeeping has injected sleeptime via either 1) or 2),
+ * 3) becomes needless, so in this case we don't need to call
+ * rtc_resume(), and this is what timekeeping_rtc_skipresume()
+ * means.
+ */
+bool timekeeping_rtc_skipresume(void)
+{
+ return sleeptime_injected;
+}
+
+/**
+ * Whether 1) can be used is only known in timekeeping_resume(),
+ * which is invoked after rtc_suspend(), so rtc_suspend() cannot
+ * safely be skipped even if the system has 1).
+ *
+ * But if system has 2), 2) will definitely be used, so in this
+ * case we don't need to call rtc_suspend(), and this is what
+ * timekeeping_rtc_skipsuspend() means.
+ */
+bool timekeeping_rtc_skipsuspend(void)
+{
+ return persistent_clock_exists;
+}
+
/**
* timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
* @delta: pointer to a timespec64 delta value
*
- * This hook is for architectures that cannot support read_persistent_clock
+ * This hook is for architectures that cannot support read_persistent_clock64
* because their RTC/persistent clock is only accessible when irqs are enabled.
+ * It is meant for those that also lack an effective nonstop clocksource.
*
* This function should only be called by rtc_resume(), and allows
* a suspend offset to be injected into the timekeeping values.
@@ -1140,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
- /*
- * Make sure we don't set the clock twice, as timekeeping_resume()
- * already did it
- */
- if (has_persistent_clock())
- return;
-
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
@@ -1162,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
/* signal hrtimers about time change */
clock_was_set();
}
+#endif
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
*/
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *clock = tk->tkr.clock;
+ struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
- struct timespec tmp;
cycle_t cycle_now, cycle_delta;
- bool suspendtime_found = false;
- read_persistent_clock(&tmp);
- ts_new = timespec_to_timespec64(tmp);
+ sleeptime_injected = false;
+ read_persistent_clock64(&ts_new);
clockevents_resume();
clocksource_resume();
@@ -1201,16 +1384,16 @@ static void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr.read(clock);
+ cycle_now = tk->tkr_mono.read(clock);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
- cycle_now > tk->tkr.cycle_last) {
+ cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
u32 mult = clock->mult;
u32 shift = clock->shift;
s64 nsec = 0;
- cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
- tk->tkr.mask);
+ cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
+ tk->tkr_mono.mask);
/*
* "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -1226,17 +1409,19 @@ static void timekeeping_resume(void)
nsec += ((u64) cycle_delta * mult) >> shift;
ts_delta = ns_to_timespec64(nsec);
- suspendtime_found = true;
+ sleeptime_injected = true;
} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
- suspendtime_found = true;
+ sleeptime_injected = true;
}
- if (suspendtime_found)
+ if (sleeptime_injected)
__timekeeping_inject_sleeptime(tk, &ts_delta);
/* Re-base the last cycle value */
- tk->tkr.cycle_last = cycle_now;
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
+
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1245,22 +1430,18 @@ static void timekeeping_resume(void)
touch_softlockup_watchdog();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
- /* Resume hrtimers */
+ tick_resume();
hrtimers_resume();
}
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
struct timespec64 delta, delta_delta;
static struct timespec64 old_delta;
- struct timespec tmp;
- read_persistent_clock(&tmp);
- timekeeping_suspend_time = timespec_to_timespec64(tmp);
+ read_persistent_clock64(&timekeeping_suspend_time);
/*
* On some systems the persistent_clock can not be detected at
@@ -1268,38 +1449,41 @@ static int timekeeping_suspend(void)
* value returned, update the persistent_clock_exists flag.
*/
if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
- persistent_clock_exist = true;
+ persistent_clock_exists = true;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
timekeeping_suspended = 1;
- /*
- * To avoid drift caused by repeated suspend/resumes,
- * which each can add ~1 second drift error,
- * try to compensate so the difference in system time
- * and persistent_clock time stays close to constant.
- */
- delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
- delta_delta = timespec64_sub(delta, old_delta);
- if (abs(delta_delta.tv_sec) >= 2) {
+ if (persistent_clock_exists) {
/*
- * if delta_delta is too large, assume time correction
- * has occured and set old_delta to the current delta.
+ * To avoid drift caused by repeated suspend/resumes,
+ * which each can add ~1 second drift error,
+ * try to compensate so the difference in system time
+ * and persistent_clock time stays close to constant.
*/
- old_delta = delta;
- } else {
- /* Otherwise try to adjust old_system to compensate */
- timekeeping_suspend_time =
- timespec64_add(timekeeping_suspend_time, delta_delta);
+ delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+ delta_delta = timespec64_sub(delta, old_delta);
+ if (abs(delta_delta.tv_sec) >= 2) {
+ /*
+ * if delta_delta is too large, assume time correction
+ * has occurred and set old_delta to the current delta.
+ */
+ old_delta = delta;
+ } else {
+ /* Otherwise try to adjust old_system to compensate */
+ timekeeping_suspend_time =
+ timespec64_add(timekeeping_suspend_time, delta_delta);
+ }
}
timekeeping_update(tk, TK_MIRROR);
+ halt_fast_timekeeper(tk);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+ tick_suspend();
clocksource_suspend();
clockevents_suspend();
@@ -1388,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*
* XXX - TODO: Doc ntp_error calculation.
*/
- if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
+ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
/* NTP adjustment caused clocksource mult overflow */
WARN_ON_ONCE(1);
return;
}
- tk->tkr.mult += mult_adj;
+ tk->tkr_mono.mult += mult_adj;
tk->xtime_interval += interval;
- tk->tkr.xtime_nsec -= offset;
+ tk->tkr_mono.xtime_nsec -= offset;
tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
}
@@ -1458,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
tk->ntp_err_mult = 0;
}
- if (unlikely(tk->tkr.clock->maxadj &&
- (abs(tk->tkr.mult - tk->tkr.clock->mult)
- > tk->tkr.clock->maxadj))) {
+ if (unlikely(tk->tkr_mono.clock->maxadj &&
+ (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
+ > tk->tkr_mono.clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
- tk->tkr.clock->name, (long)tk->tkr.mult,
- (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
+ tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
+ (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
}
/*
@@ -1481,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
- if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
- s64 neg = -(s64)tk->tkr.xtime_nsec;
- tk->tkr.xtime_nsec = 0;
+ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
+ tk->tkr_mono.xtime_nsec = 0;
tk->ntp_error += neg << tk->ntp_error_shift;
}
}
@@ -1498,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
- u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
unsigned int clock_set = 0;
- while (tk->tkr.xtime_nsec >= nsecps) {
+ while (tk->tkr_mono.xtime_nsec >= nsecps) {
int leap;
- tk->tkr.xtime_nsec -= nsecps;
+ tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
/* Figure out if its a leap sec and apply if needed */
@@ -1549,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
/* Accumulate one shifted interval */
offset -= interval;
- tk->tkr.cycle_last += interval;
+ tk->tkr_mono.cycle_last += interval;
+ tk->tkr_raw.cycle_last += interval;
- tk->tkr.xtime_nsec += tk->xtime_interval << shift;
+ tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
@@ -1594,14 +1779,17 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
- tk->tkr.cycle_last, tk->tkr.mask);
+ offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval)
goto out;
+ /* Do some additional sanity checking */
+ timekeeping_check_update(real_tk, offset);
+
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
@@ -1659,24 +1847,24 @@ out:
}
/**
- * getboottime - Return the real time of system boot.
- * @ts: pointer to the timespec to be set
+ * getboottime64 - Return the real time of system boot.
+ * @ts: pointer to the timespec64 to be set
*
- * Returns the wall-time of boot in a timespec.
+ * Returns the wall-time of boot in a timespec64.
*
* This is based on the wall_to_monotonic offset and the total suspend
* time. Calls to settimeofday will affect the value returned (which
* basically means that however wrong your real time clock is at boot time,
* you get the right time here).
*/
-void getboottime(struct timespec *ts)
+void getboottime64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
- *ts = ktime_to_timespec(t);
+ *ts = ktime_to_timespec64(t);
}
-EXPORT_SYMBOL_GPL(getboottime);
+EXPORT_SYMBOL_GPL(getboottime64);
unsigned long get_seconds(void)
{
@@ -1756,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->tkr.base_mono;
- nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
+ base = tk->tkr_mono.base;
+ nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
@@ -1788,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->tkr.base_mono;
- nsecs = timekeeping_get_ns(&tk->tkr);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index adc1fc98bde3..ead8794b9a4e 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -16,5 +16,14 @@ extern int timekeeping_inject_offset(struct timespec *ts);
extern s32 timekeeping_get_tai_offset(void);
extern void timekeeping_set_tai_offset(s32 tai_offset);
extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
+
+extern void do_timer(unsigned long ticks);
+extern void update_wall_time(void);
+
+extern seqlock_t jiffies_lock;
+
+#define CS_NAME_LEN 32
#endif
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c504939..2ece3aa5069c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -90,8 +90,18 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
+/*
+ * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
+ * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
+ * pointer to per-cpu entries because we don't know where we'll map the section,
+ * even for the boot cpu.
+ *
+ * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
+ * rest of them.
+ */
struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
+
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/* Functions below help us manage 'deferrable' flag */
@@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(try_to_del_timer_sync);
#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
+
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-static int init_timers_cpu(int cpu)
-{
- int j;
- struct tvec_base *base;
- static char tvec_base_done[NR_CPUS];
-
- if (!tvec_base_done[cpu]) {
- static char boot_done;
-
- if (boot_done) {
- /*
- * The APs use this path later in boot
- */
- base = kzalloc_node(sizeof(*base), GFP_KERNEL,
- cpu_to_node(cpu));
- if (!base)
- return -ENOMEM;
-
- /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
- if (WARN_ON(base != tbase_get_base(base))) {
- kfree(base);
- return -ENOMEM;
- }
- per_cpu(tvec_bases, cpu) = base;
- } else {
- /*
- * This is for the boot CPU - we use compile-time
- * static initialisation because per-cpu memory isn't
- * ready yet and because the memory allocators are not
- * initialised either.
- */
- boot_done = 1;
- base = &boot_tvec_bases;
- }
- spin_lock_init(&base->lock);
- tvec_base_done[cpu] = 1;
- base->cpu = cpu;
- } else {
- base = per_cpu(tvec_bases, cpu);
- }
-
-
- for (j = 0; j < TVN_SIZE; j++) {
- INIT_LIST_HEAD(base->tv5.vec + j);
- INIT_LIST_HEAD(base->tv4.vec + j);
- INIT_LIST_HEAD(base->tv3.vec + j);
- INIT_LIST_HEAD(base->tv2.vec + j);
- }
- for (j = 0; j < TVR_SIZE; j++)
- INIT_LIST_HEAD(base->tv1.vec + j);
-
- base->timer_jiffies = jiffies;
- base->next_timer = base->timer_jiffies;
- base->active_timers = 0;
- base->all_timers = 0;
- return 0;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
{
@@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu)
migrate_timer_list(new_base, old_base->tv5.vec + i);
}
+ old_base->active_timers = 0;
+ old_base->all_timers = 0;
+
spin_unlock(&old_base->lock);
spin_unlock_irq(&new_base->lock);
put_cpu_var(tvec_bases);
}
-#endif /* CONFIG_HOTPLUG_CPU */
static int timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
- long cpu = (long)hcpu;
- int err;
-
- switch(action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- err = init_timers_cpu(cpu);
- if (err < 0)
- return notifier_from_errno(err);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
+ switch (action) {
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- migrate_timers(cpu);
+ migrate_timers((long)hcpu);
break;
-#endif
default:
break;
}
+
return NOTIFY_OK;
}
-static struct notifier_block timers_nb = {
- .notifier_call = timer_cpu_notify,
-};
+static inline void timer_register_cpu_notifier(void)
+{
+ cpu_notifier(timer_cpu_notify, 0);
+}
+#else
+static inline void timer_register_cpu_notifier(void) { }
+#endif /* CONFIG_HOTPLUG_CPU */
+static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+{
+ int j;
-void __init init_timers(void)
+ BUG_ON(base != tbase_get_base(base));
+
+ base->cpu = cpu;
+ per_cpu(tvec_bases, cpu) = base;
+ spin_lock_init(&base->lock);
+
+ for (j = 0; j < TVN_SIZE; j++) {
+ INIT_LIST_HEAD(base->tv5.vec + j);
+ INIT_LIST_HEAD(base->tv4.vec + j);
+ INIT_LIST_HEAD(base->tv3.vec + j);
+ INIT_LIST_HEAD(base->tv2.vec + j);
+ }
+ for (j = 0; j < TVR_SIZE; j++)
+ INIT_LIST_HEAD(base->tv1.vec + j);
+
+ base->timer_jiffies = jiffies;
+ base->next_timer = base->timer_jiffies;
+}
+
+static void __init init_timer_cpus(void)
{
- int err;
+ struct tvec_base *base;
+ int local_cpu = smp_processor_id();
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ if (cpu == local_cpu)
+ base = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+ else
+ base = per_cpu_ptr(&__tvec_bases, cpu);
+#endif
+
+ init_timer_cpu(base, cpu);
+ }
+}
+
+void __init init_timers(void)
+{
/* ensure there are enough low bits for flags in timer->base pointer */
BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
- err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- BUG_ON(err != NOTIFY_OK);
-
+ init_timer_cpus();
init_timer_stats();
- register_cpu_notifier(&timers_nb);
+ timer_register_cpu_notifier();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 61ed862cdd37..e878c2e0ba45 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,10 +16,10 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
-#include <linux/tick.h>
#include <asm/uaccess.h>
+#include "tick-internal.h"
struct timer_list_iter {
int cpu;
@@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->set_next_event);
SEQ_printf(m, "\n");
- SEQ_printf(m, " set_mode: ");
- print_name_offset(m, dev->set_mode);
- SEQ_printf(m, "\n");
+ if (dev->set_mode) {
+ SEQ_printf(m, " set_mode: ");
+ print_name_offset(m, dev->set_mode);
+ SEQ_printf(m, "\n");
+ } else {
+ if (dev->set_state_shutdown) {
+ SEQ_printf(m, " shutdown: ");
+ print_name_offset(m, dev->set_state_shutdown);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_periodic) {
+ SEQ_printf(m, " periodic: ");
+ print_name_offset(m, dev->set_state_periodic);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_oneshot) {
+ SEQ_printf(m, " oneshot: ");
+ print_name_offset(m, dev->set_state_oneshot);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->tick_resume) {
+ SEQ_printf(m, " resume: ");
+ print_name_offset(m, dev->tick_resume);
+ SEQ_printf(m, "\n");
+ }
+ }
SEQ_printf(m, " event_handler: ");
print_name_offset(m, dev->event_handler);
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..98f26588255e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,11 +3,11 @@
ifdef CONFIG_FUNCTION_TRACER
ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
ifdef CONFIG_FTRACE_SELFTEST
# selftest needs instrumentation
-CFLAGS_trace_selftest_dynamic.o = -pg
+CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
obj-y += trace_selftest_dynamic.o
endif
endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fcc0e7052a79..5a2e0b53af30 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1059,6 +1059,12 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int ftrace_graph_active;
+#else
+# define ftrace_graph_active 0
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
static struct ftrace_ops *removed_ops;
@@ -2041,8 +2047,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
if (!ftrace_rec_count(rec))
rec->flags = 0;
else
- /* Just disable the record (keep REGS state) */
- rec->flags &= ~FTRACE_FL_ENABLED;
+ /*
+ * Just disable the record, but keep the ops TRAMP
+ * and REGS states. The _EN flags must be disabled though.
+ */
+ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
+ FTRACE_FL_REGS_EN);
}
return FTRACE_UPDATE_MAKE_NOP;
@@ -2688,24 +2698,36 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
static void ftrace_startup_sysctl(void)
{
+ int command;
+
if (unlikely(ftrace_disabled))
return;
/* Force update next time */
saved_ftrace_func = NULL;
/* ftrace_start_up is true if we want ftrace running */
- if (ftrace_start_up)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ if (ftrace_start_up) {
+ command = FTRACE_UPDATE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_START_FUNC_RET;
+ ftrace_startup_enable(command);
+ }
}
static void ftrace_shutdown_sysctl(void)
{
+ int command;
+
if (unlikely(ftrace_disabled))
return;
/* ftrace_start_up is true if ftrace is running */
- if (ftrace_start_up)
- ftrace_run_update_code(FTRACE_DISABLE_CALLS);
+ if (ftrace_start_up) {
+ command = FTRACE_DISABLE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_STOP_FUNC_RET;
+ ftrace_run_update_code(command);
+ }
}
static cycle_t ftrace_update_time;
@@ -5558,12 +5580,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
if (ftrace_enabled) {
- ftrace_startup_sysctl();
-
/* we are starting ftrace again */
if (ftrace_ops_list != &ftrace_list_end)
update_ftrace_function();
+ ftrace_startup_sysctl();
+
} else {
/* stopping ftrace calls (just send to ftrace_stub) */
ftrace_trace_function = ftrace_stub;
@@ -5590,8 +5612,6 @@ static struct ftrace_ops graph_ops = {
ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
-static int ftrace_graph_active;
-
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
return 0;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 1c71382b283d..eb4220a132ec 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 96079180de3d..5040d44fe5a3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -445,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s)
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
+ wait_queue_head_t full_waiters;
bool waiters_pending;
+ bool full_waiters_pending;
+ bool wakeup_full;
};
/*
@@ -527,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
+ if (rbwork->wakeup_full) {
+ rbwork->wakeup_full = false;
+ wake_up_all(&rbwork->full_waiters);
+ }
}
/**
@@ -551,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
* data in any cpu buffer, or a specific buffer, put the
* caller on the appropriate wait queue.
*/
- if (cpu == RING_BUFFER_ALL_CPUS)
+ if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
- else {
+ /* Full only makes sense on per cpu reads */
+ full = false;
+ } else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -ENODEV;
cpu_buffer = buffer->buffers[cpu];
@@ -562,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
while (true) {
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+ if (full)
+ prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+ else
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
/*
* The events can happen in critical sections where
@@ -584,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
* that is necessary is that the wake up happens after
* a task has been queued. It's OK for spurious wake ups.
*/
- work->waiters_pending = true;
+ if (full)
+ work->full_waiters_pending = true;
+ else
+ work->waiters_pending = true;
if (signal_pending(current)) {
ret = -EINTR;
@@ -613,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
schedule();
}
- finish_wait(&work->waiters, &wait);
+ if (full)
+ finish_wait(&work->full_waiters, &wait);
+ else
+ finish_wait(&work->waiters, &wait);
return ret;
}
@@ -1228,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
init_completion(&cpu_buffer->update_done);
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
+ init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -2799,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
static __always_inline void
rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
+ bool pagebusy;
+
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
@@ -2810,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
/* irq_work_queue() supplies it's own memory barriers */
irq_work_queue(&cpu_buffer->irq_work.work);
}
+
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+
+ if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+ cpu_buffer->irq_work.wakeup_full = true;
+ cpu_buffer->irq_work.full_waiters_pending = false;
+ /* irq_work_queue() supplies its own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c8913bac204..bcfa2add6dda 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3355,12 +3355,12 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
mutex_lock(&tracing_cpumask_update_lock);
- len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask);
- if (count - len < 2) {
+ len = snprintf(mask_str, count, "%*pb\n",
+ cpumask_pr_args(tr->tracing_cpumask));
+ if (len >= count) {
count = -EINVAL;
goto out_err;
}
- len += sprintf(mask_str + len, "\n");
count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
out_err:
@@ -4941,7 +4941,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
*fpos += written;
out_unlock:
- for (i = 0; i < nr_pages; i++){
+ for (i = nr_pages - 1; i >= 0; i--) {
kunmap_atomic(map_page[i]);
put_page(pages[i]);
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4b9c114ee9de..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
}
void *perf_trace_buf_prepare(int size, unsigned short type,
- struct pt_regs *regs, int *rctxp)
+ struct pt_regs **regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
if (*rctxp < 0)
return NULL;
+ if (regs)
+ *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
/* zero the dead bytes from align to not leak stack to user */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c1c6655847c8..ed998fbf09ce 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
return;
@@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
return;
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index f8b45d8792f9..e694c9f9efa4 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -120,7 +120,7 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
__trace_seq_init(s);
- seq_buf_bitmask(&s->seq, maskp, nmaskbits);
+ seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp);
if (unlikely(seq_buf_has_overflowed(&s->seq))) {
s->seq.len = save_len;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size -= sizeof(u32);
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
- sys_data->enter_event->event.type, regs, &rctx);
+ sys_data->enter_event->event.type, NULL, &rctx);
if (!rec)
return;
@@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size -= sizeof(u32);
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
- sys_data->exit_event->event.type, regs, &rctx);
+ sys_data->exit_event->event.type, NULL, &rctx);
if (!rec)
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 5f0eba9e5e6b..7dc1c8abecd6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
if (hlist_empty(head))
goto out;
- entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
if (!entry)
goto out;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 70bf11815f84..3174bf8e3538 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,7 @@ static int get_softlockup_thresh(void)
*/
static unsigned long get_timestamp(void)
{
- return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
+ return running_clock() >> 30LL; /* 2^30 ~= 10^9 */
}
static void set_sample_period(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6202b08f1933..586ad91300b0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -159,6 +159,7 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutex manager_arb; /* manager arbitration */
+ struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
@@ -230,7 +231,7 @@ struct wq_device;
*/
struct workqueue_struct {
struct list_head pwqs; /* WR: all pwqs of this wq */
- struct list_head list; /* PL: list of all workqueues */
+ struct list_head list; /* PR: list of all workqueues */
struct mutex mutex; /* protects this wq */
int work_color; /* WQ: current work color */
@@ -257,6 +258,13 @@ struct workqueue_struct {
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
+ /*
+ * Destruction of workqueue_struct is sched-RCU protected to allow
+ * walking the workqueues list without grabbing wq_pool_mutex.
+ * This is used to dump all workqueues from sysrq.
+ */
+ struct rcu_head rcu;
+
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
@@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
-static LIST_HEAD(workqueues); /* PL: list of all workqueues */
+static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* the per-cpu worker pools */
@@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void copy_workqueue_attrs(struct workqueue_attrs *to,
const struct workqueue_attrs *from);
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -1841,17 +1850,11 @@ static void pool_mayday_timeout(unsigned long __pool)
* spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
- *
- * Return:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
*/
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
- if (!need_to_create_worker(pool))
- return false;
restart:
spin_unlock_irq(&pool->lock);
@@ -1877,7 +1880,6 @@ restart:
*/
if (need_to_create_worker(pool))
goto restart;
- return true;
}
/**
@@ -1897,16 +1899,14 @@ restart:
* multiple times. Does GFP_KERNEL allocations.
*
* Return:
- * %false if the pool don't need management and the caller can safely start
- * processing works, %true indicates that the function released pool->lock
- * and reacquired it to perform some management function and that the
- * conditions that the caller verified while holding the lock before
- * calling the function might no longer be true.
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management function was performed and
+ * the conditions that the caller verified before calling the function may
+ * no longer be true.
*/
static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- bool ret = false;
/*
* Anyone who successfully grabs manager_arb wins the arbitration
@@ -1919,12 +1919,14 @@ static bool manage_workers(struct worker *worker)
* actual management, the pool may stall indefinitely.
*/
if (!mutex_trylock(&pool->manager_arb))
- return ret;
+ return false;
+ pool->manager = worker;
- ret |= maybe_create_worker(pool);
+ maybe_create_worker(pool);
+ pool->manager = NULL;
mutex_unlock(&pool->manager_arb);
- return ret;
+ return true;
}
/**
@@ -2312,6 +2314,7 @@ repeat:
struct wq_barrier {
struct work_struct work;
struct completion done;
+ struct task_struct *task; /* purely informational */
};
static void wq_barrier_func(struct work_struct *work)
@@ -2360,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion(&barr->done);
+ barr->task = current;
/*
* If @target is currently being executed, schedule the
@@ -2737,19 +2741,57 @@ bool flush_work(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(flush_work);
+struct cwt_wait {
+ wait_queue_t wait;
+ struct work_struct *work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+ if (cwait->work != key)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
+ static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
unsigned long flags;
int ret;
do {
ret = try_to_grab_pending(work, is_dwork, &flags);
/*
- * If someone else is canceling, wait for the same event it
- * would be waiting for before retrying.
+ * If someone else is already canceling, wait for it to
+ * finish. flush_work() doesn't work for PREEMPT_NONE
+ * because we may get scheduled between @work's completion
+ * and the other canceling task resuming and clearing
+ * CANCELING - flush_work() will return false immediately
+ * as @work is no longer busy, try_to_grab_pending() will
+ * return -ENOENT as @work is still being canceled and the
+ * other canceling task won't be able to clear CANCELING as
+ * we're hogging the CPU.
+ *
+ * Let's wait for completion using a waitqueue. As this
+ * may lead to the thundering herd problem, use a custom
+ * wake function which matches @work along with exclusive
+ * wait and wakeup.
*/
- if (unlikely(ret == -ENOENT))
- flush_work(work);
+ if (unlikely(ret == -ENOENT)) {
+ struct cwt_wait cwait;
+
+ init_wait(&cwait.wait);
+ cwait.wait.func = cwt_wakefn;
+ cwait.work = work;
+
+ prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+ TASK_UNINTERRUPTIBLE);
+ if (work_is_canceling(work))
+ schedule();
+ finish_wait(&cancel_waitq, &cwait.wait);
+ }
} while (unlikely(ret < 0));
/* tell other tasks trying to grab @work to back off */
@@ -2758,6 +2800,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
flush_work(work);
clear_work_data(work);
+
+ /*
+ * Paired with prepare_to_wait_exclusive() above so that either
+ * waitqueue_active() is visible here or !work_is_canceling() is
+ * visible there.
+ */
+ smp_mb();
+ if (waitqueue_active(&cancel_waitq))
+ __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
return ret;
}
@@ -2950,324 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set is visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
- * following attributes.
- *
- * per_cpu RO bool : whether the workqueue is per-cpu or unbound
- * max_active RW int : maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
- *
- * id RO int : the associated pool ID
- * nice RW int : nice value of the workers
- * cpumask RW mask : bitmask of allowed CPUs for the workers
- */
-struct wq_device {
- struct workqueue_struct *wq;
- struct device dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
-{
- struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
- return wq_dev->wq;
-}
-
-static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
-}
-static DEVICE_ATTR_RO(per_cpu);
-
-static ssize_t max_active_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
-}
-
-static ssize_t max_active_store(struct device *dev,
- struct device_attribute *attr, const char *buf,
- size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int val;
-
- if (sscanf(buf, "%d", &val) != 1 || val <= 0)
- return -EINVAL;
-
- workqueue_set_max_active(wq, val);
- return count;
-}
-static DEVICE_ATTR_RW(max_active);
-
-static struct attribute *wq_sysfs_attrs[] = {
- &dev_attr_per_cpu.attr,
- &dev_attr_max_active.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(wq_sysfs);
-
-static ssize_t wq_pool_ids_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- const char *delim = "";
- int node, written = 0;
-
- rcu_read_lock_sched();
- for_each_node(node) {
- written += scnprintf(buf + written, PAGE_SIZE - written,
- "%s%d:%d", delim, node,
- unbound_pwq_by_node(wq, node)->pool->id);
- delim = " ";
- }
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock_sched();
-
- return written;
-}
-
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
- mutex_unlock(&wq->mutex);
-
- return written;
-}
-
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
- struct workqueue_attrs *attrs;
-
- attrs = alloc_workqueue_attrs(GFP_KERNEL);
- if (!attrs)
- return NULL;
-
- mutex_lock(&wq->mutex);
- copy_workqueue_attrs(attrs, wq->unbound_attrs);
- mutex_unlock(&wq->mutex);
- return attrs;
-}
-
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- if (sscanf(buf, "%d", &attrs->nice) == 1 &&
- attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
- ret = apply_workqueue_attrs(wq, attrs);
- else
- ret = -EINVAL;
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static ssize_t wq_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
- mutex_unlock(&wq->mutex);
-
- written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- return written;
-}
-
-static ssize_t wq_cpumask_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- ret = cpumask_parse(buf, attrs->cpumask);
- if (!ret)
- ret = apply_workqueue_attrs(wq, attrs);
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- int written;
-
- mutex_lock(&wq->mutex);
- written = scnprintf(buf, PAGE_SIZE, "%d\n",
- !wq->unbound_attrs->no_numa);
- mutex_unlock(&wq->mutex);
-
- return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct workqueue_struct *wq = dev_to_wq(dev);
- struct workqueue_attrs *attrs;
- int v, ret;
-
- attrs = wq_sysfs_prep_attrs(wq);
- if (!attrs)
- return -ENOMEM;
-
- ret = -EINVAL;
- if (sscanf(buf, "%d", &v) == 1) {
- attrs->no_numa = !v;
- ret = apply_workqueue_attrs(wq, attrs);
- }
-
- free_workqueue_attrs(attrs);
- return ret ?: count;
-}
-
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
- __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
- __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
- __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
- __ATTR_NULL,
-};
-
-static struct bus_type wq_subsys = {
- .name = "workqueue",
- .dev_groups = wq_sysfs_groups,
-};
-
-static int __init wq_sysfs_init(void)
-{
- return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
-
-static void wq_device_release(struct device *dev)
-{
- struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
- kfree(wq_dev);
-}
-
-/**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
- *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
- *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
- *
- * Return: 0 on success, -errno on failure.
- */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
-{
- struct wq_device *wq_dev;
- int ret;
-
- /*
- * Adjusting max_active or creating new pwqs by applyting
- * attributes breaks ordering guarantee. Disallow exposing ordered
- * workqueues.
- */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
- return -EINVAL;
-
- wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
- if (!wq_dev)
- return -ENOMEM;
-
- wq_dev->wq = wq;
- wq_dev->dev.bus = &wq_subsys;
- wq_dev->dev.init_name = wq->name;
- wq_dev->dev.release = wq_device_release;
-
- /*
- * unbound_attrs are created separately. Suppress uevent until
- * everything is ready.
- */
- dev_set_uevent_suppress(&wq_dev->dev, true);
-
- ret = device_register(&wq_dev->dev);
- if (ret) {
- kfree(wq_dev);
- wq->wq_dev = NULL;
- return ret;
- }
-
- if (wq->flags & WQ_UNBOUND) {
- struct device_attribute *attr;
-
- for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
- ret = device_create_file(&wq_dev->dev, attr);
- if (ret) {
- device_unregister(&wq_dev->dev);
- wq->wq_dev = NULL;
- return ret;
- }
- }
- }
-
- dev_set_uevent_suppress(&wq_dev->dev, false);
- kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
- return 0;
-}
-
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
- */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
-{
- struct wq_device *wq_dev = wq->wq_dev;
-
- if (!wq->wq_dev)
- return;
-
- wq->wq_dev = NULL;
- device_unregister(&wq_dev->dev);
-}
-#else /* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
-#endif /* CONFIG_SYSFS */
-
/**
* free_workqueue_attrs - free a workqueue_attrs
* @attrs: workqueue_attrs to free
@@ -3386,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool)
return 0;
}
+static void rcu_free_wq(struct rcu_head *rcu)
+{
+ struct workqueue_struct *wq =
+ container_of(rcu, struct workqueue_struct, rcu);
+
+ if (!(wq->flags & WQ_UNBOUND))
+ free_percpu(wq->cpu_pwqs);
+ else
+ free_workqueue_attrs(wq->unbound_attrs);
+
+ kfree(wq->rescuer);
+ kfree(wq);
+}
+
static void rcu_free_pool(struct rcu_head *rcu)
{
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
@@ -3563,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
/*
* If we're the last pwq going away, @wq is already dead and no one
- * is gonna access it anymore. Free it.
+ * is gonna access it anymore. Schedule RCU free.
*/
- if (is_last) {
- free_workqueue_attrs(wq->unbound_attrs);
- kfree(wq);
- }
+ if (is_last)
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
}
/**
@@ -4105,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
- list_add(&wq->list, &workqueues);
+ list_add_tail_rcu(&wq->list, &workqueues);
mutex_unlock(&wq_pool_mutex);
@@ -4161,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq)
* flushing is complete in case freeze races us.
*/
mutex_lock(&wq_pool_mutex);
- list_del_init(&wq->list);
+ list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
workqueue_sysfs_unregister(wq);
- if (wq->rescuer) {
+ if (wq->rescuer)
kthread_stop(wq->rescuer->task);
- kfree(wq->rescuer);
- wq->rescuer = NULL;
- }
if (!(wq->flags & WQ_UNBOUND)) {
/*
* The base ref is never dropped on per-cpu pwqs. Directly
- * free the pwqs and wq.
+ * schedule RCU free.
*/
- free_percpu(wq->cpu_pwqs);
- kfree(wq);
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
} else {
/*
* We're the sole accessor of @wq at this point. Directly
@@ -4399,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
}
}
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+ pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+ if (pool->node != NUMA_NO_NODE)
+ pr_cont(" node=%d", pool->node);
+ pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+ if (work->func == wq_barrier_func) {
+ struct wq_barrier *barr;
+
+ barr = container_of(work, struct wq_barrier, work);
+
+ pr_cont("%s BAR(%d)", comma ? "," : "",
+ task_pid_nr(barr->task));
+ } else {
+ pr_cont("%s %pf", comma ? "," : "", work->func);
+ }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work;
+ struct worker *worker;
+ bool has_in_flight = false, has_pending = false;
+ int bkt;
+
+ pr_info(" pwq %d:", pool->id);
+ pr_cont_pool_info(pool);
+
+ pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq == pwq) {
+ has_in_flight = true;
+ break;
+ }
+ }
+ if (has_in_flight) {
+ bool comma = false;
+
+ pr_info(" in-flight:");
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq != pwq)
+ continue;
+
+ pr_cont("%s %d%s:%pf", comma ? "," : "",
+ task_pid_nr(worker->task),
+ worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+ worker->current_func);
+ list_for_each_entry(work, &worker->scheduled, entry)
+ pr_cont_work(false, work);
+ comma = true;
+ }
+ pr_cont("\n");
+ }
+
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ has_pending = true;
+ break;
+ }
+ }
+ if (has_pending) {
+ bool comma = false;
+
+ pr_info(" pending:");
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) != pwq)
+ continue;
+
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+
+ if (!list_empty(&pwq->delayed_works)) {
+ bool comma = false;
+
+ pr_info(" delayed:");
+ list_for_each_entry(work, &pwq->delayed_works, entry) {
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ unsigned long flags;
+ int pi;
+
+ rcu_read_lock_sched();
+
+ pr_info("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ struct pool_workqueue *pwq;
+ bool idle = true;
+
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+ idle = false;
+ break;
+ }
+ }
+ if (idle)
+ continue;
+
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+ for_each_pwq(pwq, wq) {
+ spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ show_pwq(pwq);
+ spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ }
+
+ for_each_pool(pool, pi) {
+ struct worker *worker;
+ bool first = true;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" workers=%d", pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ next_pool:
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ rcu_read_unlock_sched();
+}
+
/*
* CPU hotplug.
*
@@ -4796,6 +4698,323 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set are visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
+ * following attributes.
+ *
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ * pool_ids RO int : the associated pool IDs for each node
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ * numa RW bool : whether NUMA affinity is enabled
+ */
+/*
+ * Glue object tying a workqueue to its sysfs device.  It is allocated in
+ * workqueue_sysfs_register() and freed by wq_device_release().
+ */
+struct wq_device {
+ /* the workqueue this device represents; returned by dev_to_wq() */
+ struct workqueue_struct *wq;
+ /* embedded device registered on the wq_subsys bus */
+ struct device dev;
+};
+
+/* Map a sysfs device back to the workqueue that owns its wq_device. */
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+	return container_of(dev, struct wq_device, dev)->wq;
+}
+
+/* sysfs "per_cpu" show: prints 1 when WQ_UNBOUND is clear, 0 otherwise. */
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	bool is_per_cpu = !(dev_to_wq(dev)->flags & WQ_UNBOUND);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", is_per_cpu);
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+/* sysfs "max_active" show: prints the workqueue's saved_max_active. */
+static ssize_t max_active_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%d\n",
+			 dev_to_wq(dev)->saved_max_active);
+}
+
+/*
+ * sysfs "max_active" store: parse a positive decimal integer from @buf and
+ * apply it with workqueue_set_max_active().  Returns @count on success and
+ * -EINVAL when the input does not parse or is not greater than zero.
+ */
+static ssize_t max_active_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	int new_max;
+
+	if (sscanf(buf, "%d", &new_max) != 1)
+		return -EINVAL;
+	if (new_max <= 0)
+		return -EINVAL;
+
+	workqueue_set_max_active(dev_to_wq(dev), new_max);
+	return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+/*
+ * Attributes listed here (per_cpu and max_active) form the wq_sysfs
+ * attribute group; wq_subsys references the resulting wq_sysfs_groups as
+ * its dev_groups.  The list is NULL-terminated.
+ */
+static struct attribute *wq_sysfs_attrs[] = {
+ &dev_attr_per_cpu.attr,
+ &dev_attr_max_active.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+/*
+ * sysfs "pool_ids" show: emit one space-separated "node:pool_id" pair per
+ * node, followed by a newline.  The pwq lookups are done inside a
+ * sched-RCU read-side critical section.
+ */
+static ssize_t wq_pool_ids_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int node, len = 0;
+
+	rcu_read_lock_sched();
+	for_each_node(node) {
+		struct pool_workqueue *pwq = unbound_pwq_by_node(wq, node);
+
+		/* every pair after the first is preceded by a single space */
+		len += scnprintf(buf + len, PAGE_SIZE - len, "%s%d:%d",
+				 len ? " " : "", node, pwq->pool->id);
+	}
+	len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+	rcu_read_unlock_sched();
+
+	return len;
+}
+
+/* sysfs "nice" show: prints unbound_attrs->nice while holding wq->mutex. */
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int len;
+
+	mutex_lock(&wq->mutex);
+	len = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+	mutex_unlock(&wq->mutex);
+
+	return len;
+}
+
+/*
+ * Prepare workqueue_attrs for a sysfs store operation: allocate a fresh
+ * attrs object and fill it with a copy of @wq's current unbound_attrs,
+ * taken under wq->mutex.  Returns NULL if the allocation fails; otherwise
+ * the caller releases the result with free_workqueue_attrs().
+ */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+	struct workqueue_attrs *copy = alloc_workqueue_attrs(GFP_KERNEL);
+
+	if (copy) {
+		mutex_lock(&wq->mutex);
+		copy_workqueue_attrs(copy, wq->unbound_attrs);
+		mutex_unlock(&wq->mutex);
+	}
+	return copy;
+}
+
+/*
+ * sysfs "nice" store: parse a decimal nice value into a copy of the current
+ * attrs and apply it.  Returns @count on success, -ENOMEM if the attrs copy
+ * cannot be allocated, -EINVAL if the value does not parse or lies outside
+ * [MIN_NICE, MAX_NICE], or the error from apply_workqueue_attrs().
+ */
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs = wq_sysfs_prep_attrs(wq);
+	int err = -EINVAL;
+
+	if (!attrs)
+		return -ENOMEM;
+
+	if (sscanf(buf, "%d", &attrs->nice) != 1)
+		goto out_free;
+	if (attrs->nice < MIN_NICE || attrs->nice > MAX_NICE)
+		goto out_free;
+
+	err = apply_workqueue_attrs(wq, attrs);
+out_free:
+	free_workqueue_attrs(attrs);
+	return err ?: count;
+}
+
+/* sysfs "cpumask" show: prints unbound_attrs->cpumask under wq->mutex. */
+static ssize_t wq_cpumask_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int len;
+
+	mutex_lock(&wq->mutex);
+	len = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			cpumask_pr_args(wq->unbound_attrs->cpumask));
+	mutex_unlock(&wq->mutex);
+
+	return len;
+}
+
+/*
+ * sysfs "cpumask" store: parse @buf into the cpumask of a copy of the
+ * current attrs and apply it.  Returns @count on success, -ENOMEM if the
+ * attrs copy cannot be allocated, or the error from cpumask_parse() or
+ * apply_workqueue_attrs().
+ */
+static ssize_t wq_cpumask_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs = wq_sysfs_prep_attrs(wq);
+	int err;
+
+	if (!attrs)
+		return -ENOMEM;
+
+	err = cpumask_parse(buf, attrs->cpumask);
+	if (err)
+		goto out_free;
+
+	err = apply_workqueue_attrs(wq, attrs);
+out_free:
+	free_workqueue_attrs(attrs);
+	return err ?: count;
+}
+
+/*
+ * sysfs "numa" show: prints the logical negation of unbound_attrs->no_numa
+ * (1 when no_numa is clear, 0 when set), read under wq->mutex.
+ */
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int len;
+
+	mutex_lock(&wq->mutex);
+	len = scnprintf(buf, PAGE_SIZE, "%d\n", !wq->unbound_attrs->no_numa);
+	mutex_unlock(&wq->mutex);
+
+	return len;
+}
+
+/*
+ * sysfs "numa" store: parse a decimal integer and set no_numa to its
+ * logical negation in a copy of the current attrs, then apply the copy.
+ * Returns @count on success, -ENOMEM if the attrs copy cannot be allocated,
+ * -EINVAL if the input does not parse, or the error from
+ * apply_workqueue_attrs().
+ */
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs = wq_sysfs_prep_attrs(wq);
+	int enable, err;
+
+	if (!attrs)
+		return -ENOMEM;
+
+	if (sscanf(buf, "%d", &enable) != 1) {
+		err = -EINVAL;
+	} else {
+		attrs->no_numa = !enable;
+		err = apply_workqueue_attrs(wq, attrs);
+	}
+
+	free_workqueue_attrs(attrs);
+	return err ?: count;
+}
+
+/*
+ * Extra attribute files for unbound workqueues.  workqueue_sysfs_register()
+ * creates one file per entry when WQ_UNBOUND is set, walking the array
+ * until it reaches the entry with a NULL name (__ATTR_NULL).
+ */
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+ __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+ __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR_NULL,
+};
+
+/*
+ * Bus type named "workqueue" that every wq_device is attached to; devices
+ * on it get the wq_sysfs attribute groups by default.
+ */
+static struct bus_type wq_subsys = {
+ .name = "workqueue",
+ .dev_groups = wq_sysfs_groups,
+};
+
+/*
+ * Register wq_subsys as a virtual subsystem.  Runs as a core_initcall and
+ * returns the result of subsys_virtual_register().
+ */
+static int __init wq_sysfs_init(void)
+{
+ return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+/* Device release callback: free the wq_device that embeds @dev. */
+static void wq_device_release(struct device *dev)
+{
+	kfree(container_of(dev, struct wq_device, dev));
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * On any failure @wq->wq_dev is left NULL.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+	struct wq_device *wq_dev;
+	int ret;
+
+	/*
+	 * Adjusting max_active or creating new pwqs by applying
+	 * attributes breaks ordering guarantee.  Disallow exposing ordered
+	 * workqueues.
+	 */
+	if (WARN_ON(wq->flags & __WQ_ORDERED))
+		return -EINVAL;
+
+	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+	if (!wq_dev)
+		return -ENOMEM;
+
+	wq_dev->wq = wq;
+	wq_dev->dev.bus = &wq_subsys;
+	wq_dev->dev.init_name = wq->name;
+	wq_dev->dev.release = wq_device_release;
+
+	/*
+	 * unbound_attrs are created separately.  Suppress uevent until
+	 * everything is ready.
+	 */
+	dev_set_uevent_suppress(&wq_dev->dev, true);
+
+	ret = device_register(&wq_dev->dev);
+	if (ret) {
+		/*
+		 * Once device_register() has been called the device must
+		 * never be freed directly, even on failure.  Drop the
+		 * reference with put_device() instead; wq_device_release()
+		 * then frees @wq_dev.
+		 */
+		put_device(&wq_dev->dev);
+		wq->wq_dev = NULL;
+		return ret;
+	}
+
+	if (wq->flags & WQ_UNBOUND) {
+		struct device_attribute *attr;
+
+		/* add the unbound-only attribute files one by one */
+		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+			ret = device_create_file(&wq_dev->dev, attr);
+			if (ret) {
+				/* the device is registered; unregister frees it */
+				device_unregister(&wq_dev->dev);
+				wq->wq_dev = NULL;
+				return ret;
+			}
+		}
+	}
+
+	/* all attributes exist now; announce the device to userland */
+	dev_set_uevent_suppress(&wq_dev->dev, false);
+	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+	return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq has a sysfs device attached, detach it from @wq and unregister
+ * it.  Does nothing when @wq was never registered.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+	struct wq_device *wq_dev = wq->wq_dev;
+
+	if (wq_dev) {
+		/* clear the back-pointer before the device goes away */
+		wq->wq_dev = NULL;
+		device_unregister(&wq_dev->dev);
+	}
+}
+#else /* CONFIG_SYSFS */
+/* Without CONFIG_SYSFS there is nothing to unregister; empty stub. */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
+#endif /* CONFIG_SYSFS */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;