diff options
Diffstat (limited to 'kernel')
126 files changed, 6048 insertions, 3138 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +config LOCK_SPIN_ON_OWNER + def_bool y + depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/Makefile b/kernel/Makefile index a59481a3fa6c..1408b3353a3c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \ ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = -pg -CFLAGS_REMOVE_irq_work.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif # cond_syscall is currently not LTO compatible @@ -26,6 +26,7 @@ obj-y += power/ obj-y += printk/ obj-y += irq/ obj-y += rcu/ +obj-y += livepatch/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -142,7 +143,7 @@ endif kernel/system_certificates.o: $(obj)/x509_certificate_list quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") targets += $(obj)/x509_certificate_list $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list diff --git a/kernel/acct.c b/kernel/acct.c index 33738ef972f3..e6c10d1a4058 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. 
*/ -static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { struct fs_pin pin; + atomic_long_t count; + struct rcu_head rcu; struct mutex lock; int active; unsigned long needcheck; @@ -89,6 +90,8 @@ struct bsd_acct_struct { struct completion done; }; +static void do_acct_process(struct bsd_acct_struct *acct); + /* * Check the amount of free space and suspend/resume accordingly. */ @@ -124,32 +127,56 @@ out: return acct->active; } +static void acct_put(struct bsd_acct_struct *p) +{ + if (atomic_long_dec_and_test(&p->count)) + kfree_rcu(p, rcu); +} + +static inline struct bsd_acct_struct *to_acct(struct fs_pin *p) +{ + return p ? container_of(p, struct bsd_acct_struct, pin) : NULL; +} + static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; again: smp_rmb(); rcu_read_lock(); - res = ACCESS_ONCE(ns->bacct); + res = to_acct(ACCESS_ONCE(ns->bacct)); if (!res) { rcu_read_unlock(); return NULL; } - if (!atomic_long_inc_not_zero(&res->pin.count)) { + if (!atomic_long_inc_not_zero(&res->count)) { rcu_read_unlock(); cpu_relax(); goto again; } rcu_read_unlock(); mutex_lock(&res->lock); - if (!res->ns) { + if (res != to_acct(ACCESS_ONCE(ns->bacct))) { mutex_unlock(&res->lock); - pin_put(&res->pin); + acct_put(res); goto again; } return res; } +static void acct_pin_kill(struct fs_pin *pin) +{ + struct bsd_acct_struct *acct = to_acct(pin); + mutex_lock(&acct->lock); + do_acct_process(acct); + schedule_work(&acct->work); + wait_for_completion(&acct->done); + cmpxchg(&acct->ns->bacct, pin, NULL); + mutex_unlock(&acct->lock); + pin_remove(pin); + acct_put(acct); +} + static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); @@ -160,44 +187,13 @@ static void close_work(struct work_struct *work) complete(&acct->done); } -static void acct_kill(struct bsd_acct_struct *acct, - struct bsd_acct_struct *new) -{ - if (acct) { - struct pid_namespace 
*ns = acct->ns; - do_acct_process(acct); - INIT_WORK(&acct->work, close_work); - init_completion(&acct->done); - schedule_work(&acct->work); - wait_for_completion(&acct->done); - pin_remove(&acct->pin); - ns->bacct = new; - acct->ns = NULL; - atomic_long_dec(&acct->pin.count); - mutex_unlock(&acct->lock); - pin_put(&acct->pin); - } -} - -static void acct_pin_kill(struct fs_pin *pin) -{ - struct bsd_acct_struct *acct; - acct = container_of(pin, struct bsd_acct_struct, pin); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - pin_put(pin); - acct = NULL; - } - acct_kill(acct, NULL); -} - static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt, *internal; struct pid_namespace *ns = task_active_pid_ns(current); - struct bsd_acct_struct *acct, *old; + struct bsd_acct_struct *acct; + struct fs_pin *old; int err; acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); @@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname) mnt = file->f_path.mnt; file->f_path.mnt = internal; - atomic_long_set(&acct->pin.count, 1); - acct->pin.kill = acct_pin_kill; + atomic_long_set(&acct->count, 1); + init_fs_pin(&acct->pin, acct_pin_kill); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); + INIT_WORK(&acct->work, close_work); + init_completion(&acct->done); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ pin_insert(&acct->pin, mnt); - old = acct_get(ns); - if (old) - acct_kill(old, acct); - else - ns->bacct = acct; + rcu_read_lock(); + old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); + pin_kill(old); mnt_drop_write(mnt); mntput(mnt); return 0; @@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - acct_kill(acct_get(task_active_pid_ns(current)), NULL); + rcu_read_lock(); + pin_kill(task_active_pid_ns(current)->bacct); } return error; @@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const 
char __user *, name) void acct_exit_ns(struct pid_namespace *ns) { - acct_kill(acct_get(ns), NULL); + rcu_read_lock(); + pin_kill(ns->bacct); } /* @@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns) if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); - pin_put(&acct->pin); + acct_put(acct); } } } diff --git a/kernel/audit.h b/kernel/audit.h index 3cdffad5a1d9..1caa0d345d90 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -24,12 +24,6 @@ #include <linux/skbuff.h> #include <uapi/linux/mqueue.h> -/* 0 = no checking - 1 = put_count checking - 2 = verbose put_count checking -*/ -#define AUDIT_DEBUG 0 - /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate * a name dynamically and also add those to the list anchored by names_list. */ @@ -74,9 +68,8 @@ struct audit_cap_data { }; }; -/* When fs/namei.c:getname() is called, we store the pointer in name and - * we don't let putname() free it (instead we free all of the saved - * pointers at syscall exit time). +/* When fs/namei.c:getname() is called, we store the pointer in name and bump + * the refcnt in the associated filename struct. * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ @@ -86,7 +79,6 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ bool hidden; /* don't log this record */ - bool name_put; /* call __putname()? 
*/ unsigned long ino; dev_t dev; @@ -208,11 +200,6 @@ struct audit_context { }; int fds[2]; struct audit_proctitle proctitle; - -#if AUDIT_DEBUG - int put_count; - int ino_count; -#endif }; extern u32 audit_ever_enabled; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4f68a326d92e..72e1660a79a3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_nofree; bufp = data->buf; - entry->rule.vers_ops = 2; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; @@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) return ERR_PTR(-ENOMEM); new = &entry->rule; - new->vers_ops = old->vers_ops; new->flags = old->flags; new->pflags = old->pflags; new->listnr = old->listnr; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 072566dd0caf..dc4ae70a7413 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; -#if AUDIT_DEBUG == 2 - if (context->put_count + context->ino_count != context->name_count) { - int i = 0; - - pr_err("%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d ino_count=%d" - " [NOT freeing]\n", __FILE__, __LINE__, - context->serial, context->major, context->in_syscall, - context->name_count, context->put_count, - context->ino_count); - list_for_each_entry(n, &context->names_list, list) { - pr_err("names[%d] = %p = %s\n", i++, n->name, - n->name->name ?: "(null)"); - } - dump_stack(); - return; - } -#endif -#if AUDIT_DEBUG - context->put_count = 0; - context->ino_count = 0; -#endif - list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); - if (n->name && n->name_put) - final_putname(n->name); + if (n->name) + putname(n->name); if (n->should_free) kfree(n); } @@ -1711,9 +1688,6 @@ static struct audit_names 
*audit_alloc_name(struct audit_context *context, list_add_tail(&aname->list, &context->names_list); context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif return aname; } @@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr) list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; - if (n->name->uptr == uptr) + if (n->name->uptr == uptr) { + n->name->refcnt++; return n->name; + } } return NULL; } @@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name) struct audit_context *context = current->audit_context; struct audit_names *n; - if (!context->in_syscall) { -#if AUDIT_DEBUG == 2 - pr_err("%s:%d(:%d): ignoring getname(%p)\n", - __FILE__, __LINE__, context->serial, name); - dump_stack(); -#endif + if (!context->in_syscall) return; - } - -#if AUDIT_DEBUG - /* The filename _must_ have a populated ->name */ - BUG_ON(!name->name); -#endif n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) @@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; - n->name_put = true; name->aname = n; + name->refcnt++; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } -/* audit_putname - intercept a putname request - * @name: name to intercept and delay for putname - * - * If we have stored the name from getname in the audit context, - * then we delay the putname until syscall exit. - * Called from include/linux/fs.h:putname(). 
- */ -void audit_putname(struct filename *name) -{ - struct audit_context *context = current->audit_context; - - BUG_ON(!context); - if (!name->aname || !context->in_syscall) { -#if AUDIT_DEBUG == 2 - pr_err("%s:%d(:%d): final_putname(%p)\n", - __FILE__, __LINE__, context->serial, name); - if (context->name_count) { - struct audit_names *n; - int i = 0; - - list_for_each_entry(n, &context->names_list, list) - pr_err("name[%d] = %p = %s\n", i++, n->name, - n->name->name ?: "(null)"); - } -#endif - final_putname(name); - } -#if AUDIT_DEBUG - else { - ++context->put_count; - if (context->put_count > context->name_count) { - pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)" - " name_count=%d put_count=%d\n", - __FILE__, __LINE__, - context->serial, context->major, - context->in_syscall, name->name, - context->name_count, context->put_count); - dump_stack(); - } - } -#endif -} - /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, if (!name) goto out_alloc; -#if AUDIT_DEBUG - /* The struct filename _must_ have a populated ->name */ - BUG_ON(!name->name); -#endif /* * If we have a pointer to an audit_names entry already, then we can * just use it directly if the type is correct. @@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } list_for_each_entry_reverse(n, &context->names_list, list) { - if (!n->name || strcmp(n->name->name, name->name)) + if (n->ino) { + /* valid inode number, use that for the comparison */ + if (n->ino != inode->i_ino || + n->dev != inode->i_sb->s_dev) + continue; + } else if (n->name) { + /* inode number has not been set, check the name */ + if (strcmp(n->name->name, name->name)) + continue; + } else + /* no inode and no name (?!) ... this is odd ... 
*/ continue; /* match the correct record type */ @@ -1882,44 +1810,11 @@ out_alloc: n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; - /* unfortunately, while we may have a path name to record with the - * inode, we can't always rely on the string lasting until the end of - * the syscall so we need to create our own copy, it may fail due to - * memory allocation issues, but we do our best */ if (name) { - /* we can't use getname_kernel() due to size limits */ - size_t len = strlen(name->name) + 1; - struct filename *new = __getname(); - - if (unlikely(!new)) - goto out; - - if (len <= (PATH_MAX - sizeof(*new))) { - new->name = (char *)(new) + sizeof(*new); - new->separate = false; - } else if (len <= PATH_MAX) { - /* this looks odd, but is due to final_putname() */ - struct filename *new2; - - new2 = kmalloc(sizeof(*new2), GFP_KERNEL); - if (unlikely(!new2)) { - __putname(new); - goto out; - } - new2->name = (char *)new; - new2->separate = true; - new = new2; - } else { - /* we should never get here, but let's be safe */ - __putname(new); - goto out; - } - strlcpy((char *)new->name, name->name, len); - new->uptr = NULL; - new->aname = n; - n->name = new; - n->name_put = true; + n->name = name; + name->refcnt++; } + out: if (parent) { n->name_len = n->name ? 
parent_len(n->name->name) : AUDIT_NAME_FULL; @@ -1970,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent, /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name || n->type != AUDIT_TYPE_PARENT) + if (!n->name || + (n->type != AUDIT_TYPE_PARENT && + n->type != AUDIT_TYPE_UNKNOWN)) continue; - if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name->name, n->name_len)) { + if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && + !audit_compare_dname_path(dname, + n->name->name, n->name_len)) { + if (n->type == AUDIT_TYPE_UNKNOWN) + n->type = AUDIT_TYPE_PARENT; found_parent = n; break; } @@ -1983,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent, /* is there a matching child entry? */ list_for_each_entry(n, &context->names_list, list) { /* can only match entries that have a name */ - if (!n->name || n->type != type) - continue; - - /* if we found a parent, make sure this one is a child of it */ - if (found_parent && (n->name != found_parent->name)) + if (!n->name || + (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; if (!strcmp(dname, n->name->name) || @@ -1995,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent, found_parent ? 
found_parent->name_len : AUDIT_NAME_FULL)) { + if (n->type == AUDIT_TYPE_UNKNOWN) + n->type = type; found_child = n; break; } @@ -2019,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - /* don't call __putname() */ - found_child->name_put = false; + found_child->name->refcnt++; } } + if (inode) audit_copy_inode(found_child, dentry, inode); else @@ -2405,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, struct audit_aux_data_bprm_fcaps *ax; struct audit_context *context = current->audit_context; struct cpu_vfs_cap_data vcaps; - struct dentry *dentry; ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) @@ -2415,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->d.next = context->aux; context->aux = (void *)ax; - dentry = dget(bprm->file->f_path.dentry); - get_vfs_caps_from_disk(dentry, &vcaps); - dput(dentry); + get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d6594e457a25..a64e7a207d2b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - module_free(NULL, hdr); + module_memfree(hdr); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 088ac0b1b106..536edc2be307 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct fd f = fdget(ufd); struct bpf_map *map; - void *key, *value; + void *key, *value, *ptr; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) @@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; - err = 
-ENOENT; - rcu_read_lock(); - value = map->ops->map_lookup_elem(map, key); + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); if (!value) - goto err_unlock; + goto free_key; + + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, map->value_size); + rcu_read_unlock(); + + err = -ENOENT; + if (!ptr) + goto free_value; err = -EFAULT; if (copy_to_user(uvalue, value, map->value_size) != 0) - goto err_unlock; + goto free_value; err = 0; -err_unlock: - rcu_read_unlock(); +free_value: + kfree(value); free_key: kfree(key); err_put: diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bb263d0caab3..a220fdb66568 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb) * * And don't kill the default root. */ - if (css_has_online_children(&root->cgrp.self) || + if (!list_empty(&root->cgrp.self.children) || root == &cgrp_dfl_root) cgroup_put(&root->cgrp); else @@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, cft->kf_ops, cft, - NULL, false, key); + NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); @@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count) static void pidlist_free(void *p) { - if (is_vmalloc_addr(p)) - vfree(p); - else - kfree(p); + kvfree(p); } /* @@ -4373,16 +4370,20 @@ static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); - if (css->ss) { + if (ss) { /* css free path */ + int id = css->id; + if (css->parent) css_put(css->parent); - css->ss->css_free(css); + ss->css_free(css); + cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); } else { /* cgroup free path */ @@ -4434,7 +4435,7 @@ static void 
css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ - cgroup_idr_remove(&ss->css_idr, css->id); + cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); } else { @@ -5036,6 +5037,9 @@ int __init cgroup_init(void) WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } + + if (ss->bind) + ss->bind(init_css_set.subsys[ssid]); } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); diff --git a/kernel/compat.c b/kernel/compat.c index ebb3c369d03d..24f00610c575 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, * core implementation decides to return random nonsense. */ if (ret == -ERESTART_RESTARTBLOCK) { - struct restart_block *restart - = &current_thread_info()->restart_block; + struct restart_block *restart = &current->restart_block; restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; @@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { - restart = &current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = compat_clock_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 5d220234b3ca..82eea9c5af61 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,6 +20,7 @@ #include <linux/gfp.h> #include <linux/suspend.h> #include <linux/lockdep.h> +#include <linux/tick.h> #include <trace/events/power.h> #include "smpboot.h" @@ -58,22 +59,23 @@ static int cpu_hotplug_disabled; static struct { struct task_struct *active_writer; - struct mutex lock; /* Synchronizes accesses to refcount, */ + /* wait queue to wake up the active_writer */ + wait_queue_head_t wq; + /* verifies that no writer will get active while readers are active */ + struct mutex lock; /* * Also blocks the new 
readers during * an ongoing cpu hotplug operation. */ - int refcount; - /* And allows lockless put_online_cpus(). */ - atomic_t puts_pending; + atomic_t refcount; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } cpu_hotplug = { .active_writer = NULL, + .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), - .refcount = 0, #ifdef CONFIG_DEBUG_LOCK_ALLOC .dep_map = {.name = "cpu_hotplug.lock" }, #endif @@ -86,15 +88,6 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) -static void apply_puts_pending(int max) -{ - int delta; - - if (atomic_read(&cpu_hotplug.puts_pending) >= max) { - delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); - cpu_hotplug.refcount -= delta; - } -} void get_online_cpus(void) { @@ -103,8 +96,7 @@ void get_online_cpus(void) return; cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); - apply_puts_pending(65536); - cpu_hotplug.refcount++; + atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); } EXPORT_SYMBOL_GPL(get_online_cpus); @@ -116,8 +108,7 @@ bool try_get_online_cpus(void) if (!mutex_trylock(&cpu_hotplug.lock)) return false; cpuhp_lock_acquire_tryread(); - apply_puts_pending(65536); - cpu_hotplug.refcount++; + atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); return true; } @@ -125,20 +116,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus); void put_online_cpus(void) { + int refcount; + if (cpu_hotplug.active_writer == current) return; - if (!mutex_trylock(&cpu_hotplug.lock)) { - atomic_inc(&cpu_hotplug.puts_pending); - cpuhp_lock_release(); - return; - } - if (WARN_ON(!cpu_hotplug.refcount)) - cpu_hotplug.refcount++; /* try to fix things up */ + refcount = atomic_dec_return(&cpu_hotplug.refcount); + if (WARN_ON(refcount < 0)) /* try to fix things up */ + atomic_inc(&cpu_hotplug.refcount); + + if (refcount <= 0 && 
waitqueue_active(&cpu_hotplug.wq)) + wake_up(&cpu_hotplug.wq); - if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) - wake_up_process(cpu_hotplug.active_writer); - mutex_unlock(&cpu_hotplug.lock); cpuhp_lock_release(); } @@ -168,18 +157,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus); */ void cpu_hotplug_begin(void) { - cpu_hotplug.active_writer = current; + DEFINE_WAIT(wait); + cpu_hotplug.active_writer = current; cpuhp_lock_acquire(); + for (;;) { mutex_lock(&cpu_hotplug.lock); - apply_puts_pending(1); - if (likely(!cpu_hotplug.refcount)) - break; - __set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); + if (likely(!atomic_read(&cpu_hotplug.refcount))) + break; mutex_unlock(&cpu_hotplug.lock); schedule(); } + finish_wait(&cpu_hotplug.wq, &wait); } void cpu_hotplug_done(void) @@ -348,6 +339,8 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Give up timekeeping duties */ + tick_handover_do_timer(); /* Park the stopper thread */ kthread_park(current); return 0; @@ -421,10 +414,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) while (!idle_cpu(cpu)) cpu_relax(); + hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); /* CPU is completely dead: tell everyone. Too late to complain. 
*/ + tick_cleanup_dead_cpu(cpu); cpu_notify_nofail(CPU_DEAD | mod, hcpu); check_for_tasks(cpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b257f6bca2..c68f0721df10 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -548,9 +548,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) - continue; - /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); @@ -625,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -634,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; + if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) + goto done; + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -646,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.effective_cpus); + cpumask_and(doms[0], top_cpuset.effective_cpus, + non_isolated_cpus); goto done; } @@ -669,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, * the corresponding sched domain. 
*/ if (!cpumask_empty(cp->cpus_allowed) && - !is_sched_load_balance(cp)) + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) continue; if (is_sched_load_balance(cp)) @@ -751,6 +755,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); + cpumask_and(dp, dp, non_isolated_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -763,6 +768,7 @@ restart: BUG_ON(nslot != ndoms); done: + free_cpumask_var(non_isolated_cpus); kfree(csa); /* @@ -873,7 +879,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cpumask_empty(new_cpus)) + if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -1129,7 +1135,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (nodes_empty(*new_mems)) + if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. 
*/ @@ -1707,40 +1713,27 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; - ssize_t count; - char *buf, *s; int ret = 0; - count = seq_get_buf(sf, &buf); - s = buf; - spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: - s += cpulist_scnprintf(s, count, cs->cpus_allowed); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); break; case FILE_MEMLIST: - s += nodelist_scnprintf(s, count, cs->mems_allowed); + seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); break; case FILE_EFFECTIVE_CPULIST: - s += cpulist_scnprintf(s, count, cs->effective_cpus); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); break; case FILE_EFFECTIVE_MEMLIST: - s += nodelist_scnprintf(s, count, cs->effective_mems); + seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; default: ret = -EINVAL; - goto out_unlock; } - if (s < buf + count - 1) { - *s++ = '\n'; - seq_commit(sf, s - buf); - } else { - seq_commit(sf, -1); - } -out_unlock: spin_unlock_irq(&callback_lock); return ret; } @@ -1992,7 +1985,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); @@ -2400,7 +2395,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) */ } -void cpuset_init_current_mems_allowed(void) +void __init cpuset_init_current_mems_allowed(void) { nodes_setall(current->mems_allowed); } @@ -2610,8 +2605,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } -#define CPUSET_NODELIST_LEN (256) - /** * cpuset_print_task_mems_allowed - prints 
task's cpuset and mems_allowed * @tsk: pointer to task_struct of some task. @@ -2621,23 +2614,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { - /* Statically allocated to prevent using excess stack. */ - static char cpuset_nodelist[CPUSET_NODELIST_LEN]; - static DEFINE_SPINLOCK(cpuset_buffer_lock); struct cgroup *cgrp; - spin_lock(&cpuset_buffer_lock); rcu_read_lock(); cgrp = task_cs(tsk)->css.cgroup; - nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, - tsk->mems_allowed); pr_info("%s cpuset=", tsk->comm); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%s\n", cpuset_nodelist); + pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); rcu_read_unlock(); - spin_unlock(&cpuset_buffer_lock); } /* @@ -2715,10 +2701,8 @@ out: /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_puts(m, "Mems_allowed:\t"); - seq_nodemask(m, &task->mems_allowed); - seq_puts(m, "\n"); - seq_puts(m, "Mems_allowed_list:\t"); - seq_nodemask_list(m, &task->mems_allowed); - seq_puts(m, "\n"); + seq_printf(m, "Mems_allowed:\t%*pb\n", + nodemask_pr_args(&task->mems_allowed)); + seq_printf(m, "Mems_allowed_list:\t%*pbl\n", + nodemask_pr_args(&task->mems_allowed)); } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 07ce18ca71e0..0874e2edd275 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -604,7 +604,7 @@ return_normal: online_cpus) cpu_relax(); if (!time_left) - pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); + pr_crit("Timed out waiting for secondary CPUs.\n"); /* * At this point the primary processor is completely @@ -696,6 +696,14 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) if (arch_kgdb_ops.enable_nmi) arch_kgdb_ops.enable_nmi(0); + /* + * Avoid entering the debugger if we were triggered 
due to an oops + * but panic_timeout indicates the system should automatically + * reboot on panic. We don't want to get stuck waiting for input + * on such systems, especially if its "just" an oops. + */ + if (signo != SIGTRAP && panic_timeout) + return 1; memset(ks, 0, sizeof(struct kgdb_state)); ks->cpu = raw_smp_processor_id(); @@ -828,6 +836,15 @@ static int kgdb_panic_event(struct notifier_block *self, unsigned long val, void *data) { + /* + * Avoid entering the debugger if we were triggered due to a panic + * We don't want to get stuck waiting for input from user in such case. + * panic_timeout indicates the system should automatically + * reboot on panic. + */ + if (panic_timeout) + return NOTIFY_DONE; + if (dbg_kdb_mode) kdb_printf("PANIC: %s\n", (char *)data); kgdb_breakpoint(); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 7c70812caea5..fc1ef736253c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -439,7 +439,7 @@ poll_again: * substituted for %d, %x or %o in the prompt. */ -char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) +char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); @@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor) return 0; } -int vkdb_printf(const char *fmt, va_list ap) +int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) { int diag; int linecount; @@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap) size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; } + if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) + /* + * This was a interactive search (using '/' at more + * prompt) and it has completed. Clear the flag. + */ + kdb_grepping_flag = 0; /* * at this point the string is a full line and * should be printed, up to the null. @@ -691,19 +697,20 @@ kdb_printit: * Write to all consoles. 
*/ retlen = strlen(kdb_buffer); + cp = (char *) printk_skip_level(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { - gdbstub_msg_write(kdb_buffer, retlen); + gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { if (dbg_io_ops && !dbg_io_ops->is_console) { - len = retlen; - cp = kdb_buffer; + len = retlen - (cp - kdb_buffer); + cp2 = cp; while (len--) { - dbg_io_ops->write_char(*cp); - cp++; + dbg_io_ops->write_char(*cp2); + cp2++; } } while (c) { - c->write(c, kdb_buffer, retlen); + c->write(c, cp, retlen - (cp - kdb_buffer)); touch_nmi_watchdog(); c = c->next; } @@ -711,7 +718,10 @@ kdb_printit: if (logging) { saved_loglevel = console_loglevel; console_loglevel = CONSOLE_LOGLEVEL_SILENT; - printk(KERN_INFO "%s", kdb_buffer); + if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK) + printk("%s", kdb_buffer); + else + pr_info("%s", kdb_buffer); } if (KDB_STATE(PAGER)) { @@ -794,11 +804,23 @@ kdb_printit: kdb_nextline = linecount - 1; kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] == '/' && !kdb_grepping_flag) { + kdb_printf("\r"); + kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN, + kdbgetenv("SEARCHPROMPT") ?: "search> "); + *strchrnul(kdb_grep_string, '\n') = '\0'; + kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH; + suspend_grep = 1; /* for this recursion */ } else if (buf1[0] && buf1[0] != '\n') { /* user hit something other than enter */ suspend_grep = 1; /* for this recursion */ - kdb_printf("\nOnly 'q' or 'Q' are processed at more " - "prompt, input ignored\n"); + if (buf1[0] != '/') + kdb_printf( + "\nOnly 'q', 'Q' or '/' are processed at " + "more prompt, input ignored\n"); + else + kdb_printf("\n'/' cannot be used during | " + "grep filtering, input ignored\n"); } else if (kdb_grepping_flag) { /* user hit enter */ suspend_grep = 1; /* for this recursion */ @@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...) 
int r; va_start(ap, fmt); - r = vkdb_printf(fmt, ap); + r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap); va_end(ap); return r; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index f191bddf64b8..4121345498e0 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -50,8 +50,7 @@ static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); -#define GREP_LEN 256 -char kdb_grep_string[GREP_LEN]; +char kdb_grep_string[KDB_GREP_STRLEN]; int kdb_grepping_flag; EXPORT_SYMBOL(kdb_grepping_flag); int kdb_grep_leading; @@ -870,7 +869,7 @@ static void parse_grep(const char *str) len = strlen(cp); if (!len) return; - if (len >= GREP_LEN) { + if (len >= KDB_GREP_STRLEN) { kdb_printf("search string too long\n"); return; } @@ -915,13 +914,12 @@ int kdb_parse(const char *cmdstr) char *cp; char *cpp, quoted; kdbtab_t *tp; - int i, escaped, ignore_errors = 0, check_grep; + int i, escaped, ignore_errors = 0, check_grep = 0; /* * First tokenize the command string. 
*/ cp = (char *)cmdstr; - kdb_grepping_flag = check_grep = 0; if (KDB_FLAG(CMD_INTERRUPT)) { /* Previous command was interrupted, newline must not @@ -1247,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_printf("due to NonMaskable Interrupt @ " kdb_machreg_fmt "\n", instruction_pointer(regs)); - kdb_dumpregs(regs); break; case KDB_REASON_SSTEP: case KDB_REASON_BREAK: @@ -1281,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, */ kdb_nextline = 1; KDB_STATE_CLEAR(SUPPRESS); + kdb_grepping_flag = 0; + /* ensure the old search does not leak into '/' commands */ + kdb_grep_string[0] = '\0'; cmdbuf = cmd_cur; *cmdbuf = '\0'; @@ -2023,7 +2023,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4ld ", module_refcount(mod)); + kdb_printf("%4d ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); @@ -2256,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) /* * Validate cpunum */ - if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) + if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) return KDB_BADCPUNUM; dbg_switch_cpu = cpunum; @@ -2583,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv) #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" "Buffers: %8lu kB\n", - val.totalram, val.freeram, val.bufferram); + K(val.totalram), K(val.freeram), K(val.bufferram)); return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index eaacd1693954..75014d7f4568 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -196,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, /* Miscellaneous functions and data areas */ extern int kdb_grepping_flag; +#define KDB_GREPPING_FLAG_SEARCH 0x8000 extern char 
kdb_grep_string[]; +#define KDB_GREP_STRLEN 256 extern int kdb_grep_leading; extern int kdb_grep_trailing; extern char *kdb_cmds[]; @@ -209,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); -extern char *kdb_getstr(char *, size_t, char *); +extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); /* Defines for kdb_symbol_print */ diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 103f5d147b2f..2925188f50ea 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,5 +1,5 @@ ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_core.o = -pg +CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) endif obj-y := core.o ring_buffer.o callchain.o diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..2fabc0627165 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, rotation_list); +static DEFINE_PER_CPU(struct list_head, active_ctx_list); /* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. + * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and + * perf_event_task_tick() are fully serialized because they're strictly cpu + * affine and perf_event_ctx{activate,deactivate} are called with IRQs + * disabled, while perf_event_task_tick is called from IRQ context. 
*/ -static void perf_pmu_rotate_start(struct pmu *pmu) +static void perf_event_ctx_activate(struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = this_cpu_ptr(&rotation_list); + struct list_head *head = this_cpu_ptr(&active_ctx_list); WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); + WARN_ON(!list_empty(&ctx->active_ctx_list)); + + list_add(&ctx->active_ctx_list, head); +} + +static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +{ + WARN_ON(!irqs_disabled()); + + WARN_ON(list_empty(&ctx->active_ctx_list)); + + list_del_init(&ctx->active_ctx_list); } static void get_ctx(struct perf_event_context *ctx) @@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx) } /* + * Because of perf_event::ctx migration in sys_perf_event_open::move_group and + * perf_pmu_migrate_context() we need some magic. + * + * Those places that change perf_event::ctx will hold both + * perf_event_ctx::mutex of the 'old' and 'new' ctx value. + * + * Lock ordering is by mutex address. There is one other site where + * perf_event_context::mutex nests and that is put_event(). But remember that + * that is a parent<->child context relation, and migration does not affect + * children, therefore these two orderings should not interact. + * + * The change in perf_event::ctx does not affect children (as claimed above) + * because the sys_perf_event_open() case will install a new event and break + * the ctx parent<->child relation, and perf_pmu_migrate_context() is only + * concerned with cpuctx and that doesn't have children. + * + * The places that change perf_event::ctx will issue: + * + * perf_remove_from_context(); + * synchronize_rcu(); + * perf_install_in_context(); + * + * to affect the change. The remove_from_context() + synchronize_rcu() should + * quiesce the event, after which we can install it in the new location. 
This + * means that only external vectors (perf_fops, prctl) can perturb the event + * while in transit. Therefore all such accessors should also acquire + * perf_event_context::mutex to serialize against this. + * + * However; because event->ctx can change while we're waiting to acquire + * ctx->mutex we must be careful and use the below perf_event_ctx_lock() + * function. + * + * Lock order: + * task_struct::perf_event_mutex + * perf_event_context::mutex + * perf_event_context::lock + * perf_event::child_mutex; + * perf_event::mmap_mutex + * mmap_sem + */ +static struct perf_event_context * +perf_event_ctx_lock_nested(struct perf_event *event, int nesting) +{ + struct perf_event_context *ctx; + +again: + rcu_read_lock(); + ctx = ACCESS_ONCE(event->ctx); + if (!atomic_inc_not_zero(&ctx->refcount)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + mutex_lock_nested(&ctx->mutex, nesting); + if (event->ctx != ctx) { + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + + return ctx; +} + +static inline struct perf_event_context * +perf_event_ctx_lock(struct perf_event *event) +{ + return perf_event_ctx_lock_nested(event, 0); +} + +static void perf_event_ctx_unlock(struct perf_event *event, + struct perf_event_context *ctx) +{ + mutex_unlock(&ctx->mutex); + put_ctx(ctx); +} + +/* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. 
@@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_branch_stack++; list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event) if (group_leader == event) return; + WARN_ON_ONCE(group_leader->ctx != event->ctx); + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && !is_software_event(event)) group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; @@ -1296,6 +1384,10 @@ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; + + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); + /* * We can have double detach due to exit/hot-unplug + close. */ @@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event) /* Inherit group flags from the previous leader */ sibling->group_flags = event->group_flags; + + WARN_ON_ONCE(sibling->ctx != event->ctx); } out: @@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event, { u64 tstamp = perf_event_time(event); u64 delta; + + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); + /* * An event which could not be activated because of * filter mismatch still needs to have its timings @@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; - ctx->nr_active--; + if (!--ctx->nr_active) + perf_event_ctx_deactivate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) @@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. 
*/ -void perf_event_disable(struct perf_event *event) +static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -1695,6 +1794,19 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } + +/* + * Strictly speaking kernel users cannot create groups and therefore this + * interface does not need the perf_event_ctx_lock() magic. + */ +void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_disable(event); + perf_event_ctx_unlock(event, ctx); +} EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, @@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; - ctx->nr_active++; + if (!ctx->nr_active++) + perf_event_ctx_activate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; @@ -2158,7 +2271,7 @@ unlock: * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. 
*/ -void perf_event_enable(struct perf_event *event) +static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -2214,9 +2327,21 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } + +/* + * See perf_event_disable(); + */ +void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + _perf_event_enable(event); + perf_event_ctx_unlock(event, ctx); +} EXPORT_SYMBOL_GPL(perf_event_enable); -int perf_event_refresh(struct perf_event *event, int refresh) +static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events @@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh) return -EINVAL; atomic_add(refresh, &event->event_limit); - perf_event_enable(event); + _perf_event_enable(event); return 0; } + +/* + * See perf_event_disable() + */ +int perf_event_refresh(struct perf_event *event, int refresh) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_refresh(event, refresh); + perf_event_ctx_unlock(event, ctx); + + return ret; +} EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, @@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); } /* @@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx) list_rotate_left(&ctx->flexible_groups); } -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. 
- */ static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0; if (cpuctx->ctx.nr_events) { - remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { - remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; } @@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: - if (remove) - list_del_init(&cpuctx->rotation_list); return rotate; } @@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - struct perf_event_context *ctx; + struct list_head *head = this_cpu_ptr(&active_ctx_list); + struct perf_event_context *ctx, *tmp; int throttled; WARN_ON(!irqs_disabled()); @@ -2976,14 +3100,8 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - ctx = &cpuctx->ctx; + list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); - - ctx = cpuctx->task_ctx; - if (ctx) - perf_adjust_freq_unthr_context(ctx, throttled); - } } static int event_enable_on_exec(struct perf_event *event, @@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->active_ctx_list); INIT_LIST_HEAD(&ctx->pinned_groups); INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); @@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event) rcu_read_unlock(); if (owner) { - mutex_lock(&owner->perf_event_mutex); + /* + * If we're here through perf_event_exit_task() 
we're already + * holding ctx->mutex which would be an inversion wrt. the + * normal lock order. + * + * However we can safely take this lock because its the child + * ctx->mutex. + */ + mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); + /* * We have to re-check the event->owner field, if it is cleared * we raced with perf_event_exit_task(), acquiring the mutex @@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event) */ static void put_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; + struct perf_event_context *ctx; if (!atomic_long_dec_and_test(&event->refcount)) return; @@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event) if (!is_kernel_event(event)) perf_remove_from_owner(event); - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: * @@ -3461,9 +3588,10 @@ static void put_event(struct perf_event *event) * the last filedesc died, so there is no possibility * to trigger the AB-BA case. 
*/ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); + WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, true); - mutex_unlock(&ctx->mutex); + perf_event_ctx_unlock(event, ctx); _free_event(event); } @@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, ret = -EFAULT; struct perf_event_context *ctx = leader->ctx; - u64 values[5]; + int n = 0, size = 0, ret; u64 count, enabled, running; + u64 values[5]; + + lockdep_assert_held(&ctx->mutex); - mutex_lock(&ctx->mutex); count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; @@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - goto unlock; + return -EFAULT; ret = size; @@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event, size = n * sizeof(u64); if (copy_to_user(buf + ret, values, size)) { - ret = -EFAULT; - goto unlock; + return -EFAULT; } ret += size; } -unlock: - mutex_unlock(&ctx->mutex); return ret; } @@ -3660,8 +3786,14 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_event *event = file->private_data; + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = perf_read_hw(event, buf, count); + perf_event_ctx_unlock(event, ctx); - return perf_read_hw(event, buf, count); + return ret; } static unsigned int perf_poll(struct file *file, poll_table *wait) @@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } -static void perf_event_reset(struct perf_event *event) +static void _perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); local64_set(&event->count, 0); @@ -3706,6 
+3838,7 @@ static void perf_event_for_each_child(struct perf_event *event, struct perf_event *child; WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); func(event); list_for_each_entry(child, &event->child_list, child_list) @@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event, struct perf_event_context *ctx = event->ctx; struct perf_event *sibling; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); + lockdep_assert_held(&ctx->mutex); + event = event->group_leader; perf_event_for_each_child(event, func); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); - mutex_unlock(&ctx->mutex); } static int perf_event_period(struct perf_event *event, u64 __user *arg) @@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { - struct perf_event *event = file->private_data; void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { case PERF_EVENT_IOC_ENABLE: - func = perf_event_enable; + func = _perf_event_enable; break; case PERF_EVENT_IOC_DISABLE: - func = perf_event_disable; + func = _perf_event_disable; break; case PERF_EVENT_IOC_RESET: - func = perf_event_reset; + func = _perf_event_reset; break; case PERF_EVENT_IOC_REFRESH: - return perf_event_refresh(event, arg); + return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: return perf_event_period(event, (u64 __user *)arg); @@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + struct perf_event_context 
*ctx; + long ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_ioctl(event, cmd, arg); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + #ifdef CONFIG_COMPAT static long perf_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, int perf_event_task_enable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); - list_for_each_entry(event, &current->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); + list_for_each_entry(event, &current->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_enable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(&current->perf_event_mutex); return 0; @@ -3895,11 +4043,15 @@ int perf_event_task_enable(void) int perf_event_task_disable(void) { + struct perf_event_context *ctx; struct perf_event *event; mutex_lock(&current->perf_event_mutex); - list_for_each_entry(event, &current->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); + list_for_each_entry(event, &current->perf_event_list, owner_entry) { + ctx = perf_event_ctx_lock(event); + perf_event_for_each_child(event, _perf_event_disable); + perf_event_ctx_unlock(event, ctx); + } mutex_unlock(&current->perf_event_mutex); return 0; @@ -3949,7 +4101,8 @@ unlock: rcu_read_unlock(); } -void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +void __weak arch_perf_update_userpage( + struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { } @@ -3999,7 +4152,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); - arch_perf_update_userpage(userpg, now); + arch_perf_update_userpage(event, userpg, now); barrier(); ++userpg->lock; @@ -4141,6 +4294,9 @@ static void perf_mmap_open(struct vm_area_struct
*vma) atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); + + if (event->pmu->event_mapped) + event->pmu->event_mapped(event); } /* @@ -4160,6 +4316,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); + if (event->pmu->event_unmapped) + event->pmu->event_unmapped(event); + atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) @@ -4361,6 +4520,9 @@ unlock: vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; + if (event->pmu->event_mapped) + event->pmu->event_mapped(event); + return ret; } @@ -4412,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); + int rctx; + + rctx = perf_swevent_get_recursion_context(); + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. 
+ */ if (event->pending_disable) { event->pending_disable = 0; @@ -4422,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry) event->pending_wakeup = 0; perf_event_wakeup(event); } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } /* @@ -5889,6 +6061,8 @@ end: rcu_read_unlock(); } +DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); + int perf_swevent_get_recursion_context(void) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); @@ -5904,21 +6078,30 @@ inline void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; - int rctx; - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) + if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); +} + +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +{ + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (unlikely(rctx < 0)) + goto fail; + + ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); +fail: preempt_enable_notrace(); } @@ -6776,12 +6959,10 @@ skip_type: __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; __perf_cpu_hrtimer_init(cpuctx, cpu); - INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -6854,6 +7035,20 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) +{ + int ret; + + if (!try_module_get(pmu->module)) + return -ENODEV; + event->pmu = pmu; + ret = 
pmu->event_init(event); + if (ret) + module_put(pmu->module); + + return ret; +} + struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; @@ -6866,24 +7061,14 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { - if (!try_module_get(pmu->module)) { - pmu = ERR_PTR(-ENODEV); - goto unlock; - } - event->pmu = pmu; - ret = pmu->event_init(event); + ret = perf_try_init_event(pmu, event); if (ret) pmu = ERR_PTR(ret); goto unlock; } list_for_each_entry_rcu(pmu, &pmus, entry) { - if (!try_module_get(pmu->module)) { - pmu = ERR_PTR(-ENODEV); - goto unlock; - } - event->pmu = pmu; - ret = pmu->event_init(event); + ret = perf_try_init_event(pmu, event); if (!ret) goto unlock; @@ -7247,6 +7432,15 @@ out: return ret; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7262,7 +7456,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx; + struct perf_event_context *ctx, *uninitialized_var(gctx); struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -7420,7 +7614,19 @@ SYSCALL_DEFINE5(perf_event_open, * task or CPU context: */ if (move_group) { - if (group_leader->ctx->type != ctx->type) + /* + * Make sure we're both on the same task, or both + * per-cpu events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. 
+ */ + if (group_leader->cpu != event->cpu) goto err_context; } else { if (group_leader->ctx != ctx) @@ -7448,43 +7654,68 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - struct perf_event_context *gctx = group_leader->ctx; - - mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader, false); + gctx = group_leader->ctx; /* - * Removing from the context ends up with disabled - * event. What we want here is event in the initial - * startup state, ready to be add into new context. + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. */ - perf_event__state_init(group_leader); + mutex_lock_double(&gctx->mutex, &ctx->mutex); + + perf_remove_from_context(group_leader, false); + list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { perf_remove_from_context(sibling, false); - perf_event__state_init(sibling); put_ctx(gctx); } - mutex_unlock(&gctx->mutex); - put_ctx(gctx); + } else { + mutex_lock(&ctx->mutex); } WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); if (move_group) { + /* + * Wait for everybody to stop referencing the events through + * the old lists, before installing it on new lists. + */ synchronize_rcu(); - perf_install_in_context(ctx, group_leader, group_leader->cpu); - get_ctx(ctx); + + /* + * Install the group siblings before the group leader. + * + * Because a group leader will try and install the entire group + * (through the sibling list, which is still in-tact), we can + * end up with siblings installed in the wrong context. + * + * By installing siblings first we NO-OP because they're not + * reachable through the group lists. + */ list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { + perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); } + + /* + * Removing from the context ends up with disabled + * event. What we want here is event in the initial + * startup state, ready to be add into new context. 
+ */ + perf_event__state_init(group_leader); + perf_install_in_context(ctx, group_leader, group_leader->cpu); + get_ctx(ctx); } perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); + + if (move_group) { + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } mutex_unlock(&ctx->mutex); put_online_cpus(); @@ -7592,7 +7823,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; - mutex_lock(&src_ctx->mutex); + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { perf_remove_from_context(event, false); @@ -7600,11 +7835,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) put_ctx(src_ctx); list_add(&event->migrate_entry, &events); } - mutex_unlock(&src_ctx->mutex); + /* + * Wait for the events to quiesce before re-instating them. + */ synchronize_rcu(); - mutex_lock(&dst_ctx->mutex); + /* + * Re-instate events in 2 passes. + * + * Skip over group leaders and only install siblings on this first + * pass, siblings will not get enabled without a leader, however a + * leader will enable its siblings, even if those are still on the old + * context. + */ + list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + if (event->group_leader == event) + continue; + + list_del(&event->migrate_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, dst_cpu); + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + + /* + * Once all the siblings are setup properly, install the group leaders + * to make it go. 
+ */ list_for_each_entry_safe(event, tmp, &events, migrate_entry) { list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) @@ -7614,6 +7874,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) get_ctx(dst_ctx); } mutex_unlock(&dst_ctx->mutex); + mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); @@ -7800,14 +8061,19 @@ static void perf_free_event(struct perf_event *event, put_event(parent); + raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); free_event(event); } /* - * free an unexposed, unused context as created by inheritance by + * Free an unexposed, unused context as created by inheritance by * perf_event_init_task below, used by fork() in case of fail. + * + * Not all locks are strictly required, but take them anyway to be nice and + * help out with the lockdep assertions. */ void perf_event_free_task(struct task_struct *task) { @@ -8126,7 +8392,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); } } @@ -8147,22 +8413,11 @@ static void perf_event_init_cpu(int cpu) } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - static void __perf_event_exit_context(void *__info) { struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; - perf_pmu_rotate_stop(ctx->pmu); - rcu_read_lock(); list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) __perf_remove_from_context(&re); @@ -8273,6 +8528,18 @@ void __init perf_event_init(void) != 1024); } +ssize_t perf_event_sysfs_show(struct 
device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + if (pmu_attr->event_str) + return sprintf(page, "%s\n", pmu_attr->event_str); + + return 0; +} + static int __init perf_event_sysfs_init(void) { struct pmu *pmu; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 146a5792b1d2..eadb95ce7aac 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -13,12 +13,13 @@ #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/circ_buf.h> +#include <linux/poll.h> #include "internal.h" static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, POLL_IN); + atomic_set(&handle->rb->poll, POLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending); diff --git a/kernel/exit.c b/kernel/exit.c index 6806c55475ee..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - clear_thread_flag(TIF_MEMDIE); + if (test_thread_flag(TIF_MEMDIE)) + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1..cf65139615a0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } @@ -559,6 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct 
*mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm_nr_pmds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -607,6 +604,14 @@ static void check_mm(struct mm_struct *mm) printk(KERN_ALERT "BUG: Bad rss-counter state " "mm:%p idx:%d val:%ld\n", mm, i, x); } + + if (atomic_long_read(&mm->nr_ptes)) + pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", + atomic_long_read(&mm->nr_ptes)); + if (mm_nr_pmds(mm)) + pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", + mm_nr_pmds(mm)); + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif diff --git a/kernel/futex.c b/kernel/futex.c index 63678b573d61..2579e407ff67 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, if (!p) return -ESRCH; - if (!p->mm) { + if (unlikely(p->flags & PF_KTHREAD)) { put_task_struct(p); return -EPERM; } @@ -2217,7 +2217,7 @@ retry: if (!abs_time) goto out; - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; @@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) 
*/ -static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: - return futex_lock_pi(uaddr, flags, val, timeout, 0); + return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: - return futex_lock_pi(uaddr, flags, 0, timeout, 1); + return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 52aa7e8de927..752d6486b67e 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,33 +1,7 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' -# if-lt -# Usage VAR := $(call if-lt, $(a), $(b)) -# Returns 1 if (a < b) -if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) - -ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) - cc-ver := 0304 -else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) - cc-ver := 0407 -else -# Use cc-version if available, otherwise set 0 -# -# scripts/Kbuild.include, which contains cc-version function, is not included -# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" -# Meaning cc-ver is empty causing if-lt test to fail with -# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage. -# This has no affect on the clean phase, but the error message could be -# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version -# is not available. We can probably move if-lt to Kbuild.include, so it's also -# not defined during clean or to include Kbuild.include in -# scripts/Makefile.clean. But the following workaround seems least invasive. 
- cc-ver := $(if $(call cc-version),$(call cc-version),0) -endif - -obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o - -ifeq ($(call if-lt, $(cc-ver), 0407),1) - obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o -else - obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o -endif +obj-y := base.o fs.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \ + gcc_3_4.o, gcc_4_7.o) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6f1c7a566b95..eb9a4ea394ab 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) return -ENOSYS; } + +/** + * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt + * @data: Pointer to interrupt specific data + * @on: Whether to set or reset the wake-up capability of this irq + * + * Conditional, as the underlying parent chip might not implement it. + */ +int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) +{ + data = data->parent_data; + if (data->chip->irq_set_wake) + return data->chip->irq_set_wake(data, on); + + return -ENOSYS; +} #endif /** diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 80692373abd6..e68932bb308e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) * Do not use this for shutdown scenarios where you must be sure * that all parts (hardirq and threaded handler) have completed. * + * Returns: false if a threaded handler is active. + * * This function may be called - with care - from IRQ context. 
*/ -void synchronize_hardirq(unsigned int irq) +bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) + if (desc) { __synchronize_hardirq(desc); + return !atomic_read(&desc->threads_active); + } + + return true; } EXPORT_SYMBOL(synchronize_hardirq); @@ -243,6 +249,9 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) return -EINVAL; desc->affinity_hint = m; irq_put_desc_unlock(desc, flags); + /* set the initial affinity to prevent every interrupt being on CPU0 */ + if (m) + __irq_set_affinity(irq, m, false); return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); @@ -437,6 +446,32 @@ void disable_irq(unsigned int irq) } EXPORT_SYMBOL(disable_irq); +/** + * disable_hardirq - disables an irq and waits for hardirq completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are + * nested. + * This function waits for any pending hard IRQ handlers for this + * interrupt to complete before returning. If you use this function while + * holding a resource the hard IRQ handler may need you will deadlock. + * + * When used to optimistically disable an interrupt from atomic context + * the return value must be checked. + * + * Returns: false if a threaded handler is active. + * + * This function may be called - with care - from IRQ context. + */ +bool disable_hardirq(unsigned int irq) +{ + if (!__disable_irq_nosync(irq)) + return synchronize_hardirq(irq); + + return false; +} +EXPORT_SYMBOL_GPL(disable_hardirq); + void __enable_irq(struct irq_desc *desc, unsigned int irq) { switch (desc->depth) { @@ -1471,8 +1506,13 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). + * + * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and + * it cannot be set along with IRQF_NO_SUSPEND. 
*/ - if ((irqflags & IRQF_SHARED) && !dev_id) + if (((irqflags & IRQF_SHARED) && !dev_id) || + (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || + ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); @@ -1758,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } + +/** + * irq_get_irqchip_state - returns the irqchip state of a interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: One of IRQCHIP_STATE_* the caller wants to know about + * @state: a pointer to a boolean where the state is to be storeed + * + * This call snapshots the internal irqchip state of an + * interrupt, returning into @state the bit corresponding to + * stage @which + * + * This function should be called with preemption disabled if the + * interrupt controller has per-cpu registers. + */ +int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, + bool *state) +{ + struct irq_desc *desc; + struct irq_data *data; + struct irq_chip *chip; + unsigned long flags; + int err = -EINVAL; + + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return err; + + data = irq_desc_get_irq_data(desc); + + do { + chip = irq_data_get_irq_chip(data); + if (chip->irq_get_irqchip_state) + break; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + data = data->parent_data; +#else + data = NULL; +#endif + } while (data); + + if (data) + err = chip->irq_get_irqchip_state(data, which, state); + + irq_put_desc_busunlock(desc, flags); + return err; +} + +/** + * irq_set_irqchip_state - set the state of a forwarded interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which + * + * This call sets the internal irqchip state of an interrupt, + * depending on the value of @which. 
+ * + * This function should be called with preemption disabled if the + * interrupt controller has per-cpu registers. + */ +int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, + bool val) +{ + struct irq_desc *desc; + struct irq_data *data; + struct irq_chip *chip; + unsigned long flags; + int err = -EINVAL; + + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return err; + + data = irq_desc_get_irq_data(desc); + + do { + chip = irq_data_get_irq_chip(data); + if (chip->irq_set_irqchip_state) + break; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + data = data->parent_data; +#else + data = NULL; +#endif + } while (data); + + if (data) + err = chip->irq_set_irqchip_state(data, which, val); + + irq_put_desc_busunlock(desc, flags); + return err; +} diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3e18163f336f..474de5cb394d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) struct msi_desc *desc; for_each_msi_entry(desc, dev) { - irq_domain_free_irqs(desc->irq, desc->nvec_used); - desc->irq = 0; + /* + * We might have failed to allocate an MSI early + * enough that there is no IRQ associated to this + * entry. If that's the case, don't do anything. 
+ */ + if (desc->irq) { + irq_domain_free_irqs(desc->irq, desc->nvec_used); + desc->irq = 0; + } } } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 3ca532592704..5204a6d1b985 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -43,9 +43,12 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && - desc->no_suspend_depth != desc->nr_actions); + (desc->no_suspend_depth + + desc->cond_suspend_depth) != desc->nr_actions); } /* @@ -61,6 +64,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc, int irq) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9dc9bfd8a678..df2f4642d1e7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -46,10 +46,9 @@ static int show_irq_affinity(int type, struct seq_file *m, void *v) mask = desc->pending_mask; #endif if (type) - seq_cpumask_list(m, mask); + seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); else - seq_cpumask(m, mask); - seq_putc(m, '\n'); + seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); return 0; } @@ -67,8 +66,7 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) cpumask_copy(mask, desc->affinity_hint); raw_spin_unlock_irqrestore(&desc->lock, flags); - seq_cpumask(m, mask); - seq_putc(m, '\n'); + seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return 0; @@ -186,8 +184,7 @@ static const struct file_operations irq_affinity_list_proc_fops = { static int default_affinity_show(struct seq_file *m, void *v) { - seq_cpumask(m, irq_default_affinity); - seq_putc(m, '\n'); + seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); return 0; } diff --git 
a/kernel/kexec.c b/kernel/kexec.c index 9a8a01abbaed..38c25b1f2fd5 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, } /* - * Free up memory used by kernel, initrd, and comand line. This is temporary + * Free up memory used by kernel, initrd, and command line. This is temporary * memory allocation which is not needed any more after these buffers have * been loaded into separate segments and have been copied elsewhere. */ @@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image, destination &= PAGE_MASK; result = kimage_add_entry(image, destination | IND_DESTINATION); - if (result == 0) - image->destination = destination; return result; } @@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page) page &= PAGE_MASK; result = kimage_add_entry(image, page | IND_SOURCE); - if (result == 0) - image->destination += PAGE_SIZE; return result; } @@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (nr_segments > 0) { unsigned long i; - /* Loading another kernel to reboot into */ - if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - /* Loading another kernel to switch to if this one crashes */ - else if (flags & KEXEC_ON_CRASH) { - /* Free any current crash dump kernel before + if (flags & KEXEC_ON_CRASH) { + /* + * Loading another kernel to switch to if this one + * crashes. Free any current crash dump kernel before * we corrupt it. */ + kimage_free(xchg(&kexec_crash_image, NULL)); result = kimage_alloc_init(&image, entry, nr_segments, segments, flags); crash_map_reserved_pages(); + } else { + /* Loading another kernel to reboot into. 
*/ + + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); } if (result) goto out; @@ -2512,7 +2511,7 @@ static int kexec_apply_relocations(struct kimage *image) continue; /* - * Respective archicture needs to provide support for applying + * Respective architecture needs to provide support for applying * relocations of type SHT_RELA/SHT_REL. */ if (sechdrs[i].sh_type == SHT_RELA) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..c90e417bb963 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -127,7 +127,7 @@ static void *alloc_insn_page(void) static void free_insn_page(void *page) { - module_free(NULL, page); + module_memfree(page); } struct kprobe_insn_cache kprobe_insn_slots = { @@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ @@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); return &op->kp; } @@ -869,7 +869,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; - unoptimize_kprobe(p, false); /* Try to unoptimize */ + /* Try to unoptimize */ + unoptimize_kprobe(p, kprobes_all_disarmed); if (!kprobe_queued(p)) { arch_disarm_kprobe(p); @@ -1571,7 +1572,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { - disarm_kprobe(orig_p, true); + /* + * If kprobes_all_disarmed is set, orig_p + * should have already been disarmed, so + * skip unneed disarming process. 
+ */ + if (!kprobes_all_disarmed) + disarm_kprobe(orig_p, true); orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -2320,6 +2327,12 @@ static void arm_all_kprobes(void) if (!kprobes_all_disarmed) goto already_enabled; + /* + * optimize_kprobe() called by arm_kprobe() checks + * kprobes_all_disarmed, so set kprobes_all_disarmed before + * arm_kprobe. + */ + kprobes_all_disarmed = false; /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -2328,7 +2341,6 @@ static void arm_all_kprobes(void) arm_kprobe(p); } - kprobes_all_disarmed = false; printk(KERN_INFO "Kprobes globally enabled\n"); already_enabled: diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig new file mode 100644 index 000000000000..045022557936 --- /dev/null +++ b/kernel/livepatch/Kconfig @@ -0,0 +1,18 @@ +config HAVE_LIVEPATCH + bool + help + Arch supports kernel live patching + +config LIVEPATCH + bool "Kernel Live Patching" + depends on DYNAMIC_FTRACE_WITH_REGS + depends on MODULES + depends on SYSFS + depends on KALLSYMS_ALL + depends on HAVE_LIVEPATCH + help + Say Y here if you want to support kernel live patching. + This option has no runtime impact until a kernel "patch" + module uses the interface provided by this option to register + a patch, causing calls to patched functions to be redirected + to new function code contained in the patch module. 
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile new file mode 100644 index 000000000000..e8780c0901d9 --- /dev/null +++ b/kernel/livepatch/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LIVEPATCH) += livepatch.o + +livepatch-objs := core.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c new file mode 100644 index 000000000000..284e2691e380 --- /dev/null +++ b/kernel/livepatch/core.c @@ -0,0 +1,1003 @@ +/* + * core.c - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/ftrace.h> +#include <linux/list.h> +#include <linux/kallsyms.h> +#include <linux/livepatch.h> + +/** + * struct klp_ops - structure for tracking registered ftrace ops structs + * + * A single ftrace_ops is shared between all enabled replacement functions + * (klp_func structs) which have the same old_addr. This allows the switch + * between function versions to happen instantaneously by updating the klp_ops + * struct's func_stack list. The winner is the klp_func at the top of the + * func_stack (front of the list). 
+ * + * @node: node for the global klp_ops list + * @func_stack: list head for the stack of klp_func's (active func is on top) + * @fops: registered ftrace ops struct + */ +struct klp_ops { + struct list_head node; + struct list_head func_stack; + struct ftrace_ops fops; +}; + +/* + * The klp_mutex protects the global lists and state transitions of any + * structure reachable from them. References to any structure must be obtained + * under mutex protection (except in klp_ftrace_handler(), which uses RCU to + * ensure it gets consistent data). + */ +static DEFINE_MUTEX(klp_mutex); + +static LIST_HEAD(klp_patches); +static LIST_HEAD(klp_ops); + +static struct kobject *klp_root_kobj; + +static struct klp_ops *klp_find_ops(unsigned long old_addr) +{ + struct klp_ops *ops; + struct klp_func *func; + + list_for_each_entry(ops, &klp_ops, node) { + func = list_first_entry(&ops->func_stack, struct klp_func, + stack_node); + if (func->old_addr == old_addr) + return ops; + } + + return NULL; +} + +static bool klp_is_module(struct klp_object *obj) +{ + return obj->name; +} + +static bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +/* sets obj->mod if object is not vmlinux and module is found */ +static void klp_find_object_module(struct klp_object *obj) +{ + struct module *mod; + + if (!klp_is_module(obj)) + return; + + mutex_lock(&module_mutex); + /* + * We do not want to block removal of patched modules and therefore + * we do not take a reference here. The patches are removed by + * a going module handler instead. + */ + mod = find_module(obj->name); + /* + * Do not mess work of the module coming and going notifiers. + * Note that the patch might still be needed before the going handler + * is called. Module functions can be called even in the GOING state + * until mod->exit() finishes. This is especially important for + * patches that modify semantic of the functions. 
+ */ + if (mod && mod->klp_alive) + obj->mod = mod; + + mutex_unlock(&module_mutex); +} + +/* klp_mutex must be held by caller */ +static bool klp_is_patch_registered(struct klp_patch *patch) +{ + struct klp_patch *mypatch; + + list_for_each_entry(mypatch, &klp_patches, list) + if (mypatch == patch) + return true; + + return false; +} + +static bool klp_initialized(void) +{ + return klp_root_kobj; +} + +struct klp_find_arg { + const char *objname; + const char *name; + unsigned long addr; + /* + * If count == 0, the symbol was not found. If count == 1, a unique + * match was found and addr is set. If count > 1, there is + * unresolvable ambiguity among "count" number of symbols with the same + * name in the same object. + */ + unsigned long count; +}; + +static int klp_find_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_find_arg *args = data; + + if ((mod && !args->objname) || (!mod && args->objname)) + return 0; + + if (strcmp(args->name, name)) + return 0; + + if (args->objname && strcmp(args->objname, mod->name)) + return 0; + + /* + * args->addr might be overwritten if another match is found + * but klp_find_object_symbol() handles this and only returns the + * addr if count == 1. 
+ */ + args->addr = addr; + args->count++; + + return 0; +} + +static int klp_find_object_symbol(const char *objname, const char *name, + unsigned long *addr) +{ + struct klp_find_arg args = { + .objname = objname, + .name = name, + .addr = 0, + .count = 0 + }; + + kallsyms_on_each_symbol(klp_find_callback, &args); + + if (args.count == 0) + pr_err("symbol '%s' not found in symbol table\n", name); + else if (args.count > 1) + pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", + args.count, name, objname); + else { + *addr = args.addr; + return 0; + } + + *addr = 0; + return -EINVAL; +} + +struct klp_verify_args { + const char *name; + const unsigned long addr; +}; + +static int klp_verify_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_verify_args *args = data; + + if (!mod && + !strcmp(args->name, name) && + args->addr == addr) + return 1; + + return 0; +} + +static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) +{ + struct klp_verify_args args = { + .name = name, + .addr = addr, + }; + + if (kallsyms_on_each_symbol(klp_verify_callback, &args)) + return 0; + + pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", + name, addr); + return -EINVAL; +} + +static int klp_find_verify_func_addr(struct klp_object *obj, + struct klp_func *func) +{ + int ret; + +#if defined(CONFIG_RANDOMIZE_BASE) + /* KASLR is enabled, disregard old_addr from user */ + func->old_addr = 0; +#endif + + if (!func->old_addr || klp_is_module(obj)) + ret = klp_find_object_symbol(obj->name, func->old_name, + &func->old_addr); + else + ret = klp_verify_vmlinux_symbol(func->old_name, + func->old_addr); + + return ret; +} + +/* + * external symbols are located outside the parent object (where the parent + * object is either vmlinux or the kmod being patched). 
+ */ +static int klp_find_external_symbol(struct module *pmod, const char *name, + unsigned long *addr) +{ + const struct kernel_symbol *sym; + + /* first, check if it's an exported symbol */ + preempt_disable(); + sym = find_symbol(name, NULL, NULL, true, true); + if (sym) { + *addr = sym->value; + preempt_enable(); + return 0; + } + preempt_enable(); + + /* otherwise check if it's in another .o within the patch module */ + return klp_find_object_symbol(pmod->name, name, addr); +} + +static int klp_write_object_relocations(struct module *pmod, + struct klp_object *obj) +{ + int ret; + struct klp_reloc *reloc; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + if (WARN_ON(!obj->relocs)) + return -EINVAL; + + for (reloc = obj->relocs; reloc->name; reloc++) { + if (!klp_is_module(obj)) { + ret = klp_verify_vmlinux_symbol(reloc->name, + reloc->val); + if (ret) + return ret; + } else { + /* module, reloc->val needs to be discovered */ + if (reloc->external) + ret = klp_find_external_symbol(pmod, + reloc->name, + &reloc->val); + else + ret = klp_find_object_symbol(obj->mod->name, + reloc->name, + &reloc->val); + if (ret) + return ret; + } + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, + reloc->val + reloc->addend); + if (ret) { + pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", + reloc->name, reloc->val, ret); + return ret; + } + } + + return 0; +} + +static void notrace klp_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *fops, + struct pt_regs *regs) +{ + struct klp_ops *ops; + struct klp_func *func; + + ops = container_of(fops, struct klp_ops, fops); + + rcu_read_lock(); + func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, + stack_node); + if (WARN_ON_ONCE(!func)) + goto unlock; + + klp_arch_set_pc(regs, (unsigned long)func->new_func); +unlock: + rcu_read_unlock(); +} + +static void klp_disable_func(struct klp_func *func) +{ + struct klp_ops *ops; + + WARN_ON(func->state != 
KLP_ENABLED); + WARN_ON(!func->old_addr); + + ops = klp_find_ops(func->old_addr); + if (WARN_ON(!ops)) + return; + + if (list_is_singular(&ops->func_stack)) { + WARN_ON(unregister_ftrace_function(&ops->fops)); + WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); + + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + } else { + list_del_rcu(&func->stack_node); + } + + func->state = KLP_DISABLED; +} + +static int klp_enable_func(struct klp_func *func) +{ + struct klp_ops *ops; + int ret; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + if (WARN_ON(func->state != KLP_DISABLED)) + return -EINVAL; + + ops = klp_find_ops(func->old_addr); + if (!ops) { + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->fops.func = klp_ftrace_handler; + ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_IPMODIFY; + + list_add(&ops->node, &klp_ops); + + INIT_LIST_HEAD(&ops->func_stack); + list_add_rcu(&func->stack_node, &ops->func_stack); + + ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + goto err; + } + + ret = register_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + goto err; + } + + + } else { + list_add_rcu(&func->stack_node, &ops->func_stack); + } + + func->state = KLP_ENABLED; + + return 0; + +err: + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + return ret; +} + +static void klp_disable_object(struct klp_object *obj) +{ + struct klp_func *func; + + for (func = obj->funcs; func->old_name; func++) + if (func->state == KLP_ENABLED) + klp_disable_func(func); + + obj->state = KLP_DISABLED; +} + +static int klp_enable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if 
(WARN_ON(obj->state != KLP_DISABLED)) + return -EINVAL; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_enable_func(func); + if (ret) { + klp_disable_object(obj); + return ret; + } + } + obj->state = KLP_ENABLED; + + return 0; +} + +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + + /* enforce stacking: only the last enabled patch can be disabled */ + if (!list_is_last(&patch->list, &klp_patches) && + list_next_entry(patch, list)->state == KLP_ENABLED) + return -EBUSY; + + pr_notice("disabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + if (obj->state == KLP_ENABLED) + klp_disable_object(obj); + } + + patch->state = KLP_DISABLED; + + return 0; +} + +/** + * klp_disable_patch() - disables a registered patch + * @patch: The registered, enabled patch to be disabled + * + * Unregisters the patched functions from ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_disable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + if (patch->state == KLP_DISABLED) { + ret = -EINVAL; + goto err; + } + + ret = __klp_disable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_disable_patch); + +static int __klp_enable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (WARN_ON(patch->state != KLP_DISABLED)) + return -EINVAL; + + /* enforce stacking: only the first disabled patch can be enabled */ + if (patch->list.prev != &klp_patches && + list_prev_entry(patch, list)->state == KLP_DISABLED) + return -EBUSY; + + pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); + add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + + pr_notice("enabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + if 
(!klp_is_object_loaded(obj)) + continue; + + ret = klp_enable_object(obj); + if (ret) + goto unregister; + } + + patch->state = KLP_ENABLED; + + return 0; + +unregister: + WARN_ON(__klp_disable_patch(patch)); + return ret; +} + +/** + * klp_enable_patch() - enables a registered patch + * @patch: The registered, disabled patch to be enabled + * + * Performs the needed symbol lookups and code relocations, + * then registers the patched functions with ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_enable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + ret = __klp_enable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * Sysfs Interface + * + * /sys/kernel/livepatch + * /sys/kernel/livepatch/<patch> + * /sys/kernel/livepatch/<patch>/enabled + * /sys/kernel/livepatch/<patch>/<object> + * /sys/kernel/livepatch/<patch>/<object>/<func> + */ + +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + unsigned long val; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return -EINVAL; + + if (val != KLP_DISABLED && val != KLP_ENABLED) + return -EINVAL; + + patch = container_of(kobj, struct klp_patch, kobj); + + mutex_lock(&klp_mutex); + + if (val == patch->state) { + /* already in requested state */ + ret = -EINVAL; + goto err; + } + + if (val == KLP_ENABLED) { + ret = __klp_enable_patch(patch); + if (ret) + goto err; + } else { + ret = __klp_disable_patch(patch); + if (ret) + goto err; + } + + mutex_unlock(&klp_mutex); + + return count; + +err: + mutex_unlock(&klp_mutex); + return ret; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + return snprintf(buf, 
PAGE_SIZE-1, "%d\n", patch->state); +} + +static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); +static struct attribute *klp_patch_attrs[] = { + &enabled_kobj_attr.attr, + NULL +}; + +static void klp_kobj_release_patch(struct kobject *kobj) +{ + /* + * Once we have a consistency model we'll need to module_put() the + * patch module here. See klp_register_patch() for more details. + */ +} + +static struct kobj_type klp_ktype_patch = { + .release = klp_kobj_release_patch, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = klp_patch_attrs, +}; + +static void klp_kobj_release_func(struct kobject *kobj) +{ +} + +static struct kobj_type klp_ktype_func = { + .release = klp_kobj_release_func, + .sysfs_ops = &kobj_sysfs_ops, +}; + +/* + * Free all functions' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. + */ +static void klp_free_funcs_limited(struct klp_object *obj, + struct klp_func *limit) +{ + struct klp_func *func; + + for (func = obj->funcs; func->old_name && func != limit; func++) + kobject_put(&func->kobj); +} + +/* Clean up when a patched object is unloaded */ +static void klp_free_object_loaded(struct klp_object *obj) +{ + struct klp_func *func; + + obj->mod = NULL; + + for (func = obj->funcs; func->old_name; func++) + func->old_addr = 0; +} + +/* + * Free all objects' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. 
+ */ +static void klp_free_objects_limited(struct klp_patch *patch, + struct klp_object *limit) +{ + struct klp_object *obj; + + for (obj = patch->objs; obj->funcs && obj != limit; obj++) { + klp_free_funcs_limited(obj, NULL); + kobject_put(obj->kobj); + } +} + +static void klp_free_patch(struct klp_patch *patch) +{ + klp_free_objects_limited(patch, NULL); + if (!list_empty(&patch->list)) + list_del(&patch->list); + kobject_put(&patch->kobj); +} + +static int klp_init_func(struct klp_object *obj, struct klp_func *func) +{ + INIT_LIST_HEAD(&func->stack_node); + func->state = KLP_DISABLED; + + return kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, "%s", func->old_name); +} + +/* parts of the initialization that is done only when the object is loaded */ +static int klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (obj->relocs) { + ret = klp_write_object_relocations(patch->mod, obj); + if (ret) + return ret; + } + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_find_verify_func_addr(obj, func); + if (ret) + return ret; + } + + return 0; +} + +static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) +{ + struct klp_func *func; + int ret; + const char *name; + + if (!obj->funcs) + return -EINVAL; + + obj->state = KLP_DISABLED; + obj->mod = NULL; + + klp_find_object_module(obj); + + name = klp_is_module(obj) ? 
obj->name : "vmlinux"; + obj->kobj = kobject_create_and_add(name, &patch->kobj); + if (!obj->kobj) + return -ENOMEM; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_init_func(obj, func); + if (ret) + goto free; + } + + if (klp_is_object_loaded(obj)) { + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto free; + } + + return 0; + +free: + klp_free_funcs_limited(obj, func); + kobject_put(obj->kobj); + return ret; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (!patch->objs) + return -EINVAL; + + mutex_lock(&klp_mutex); + + patch->state = KLP_DISABLED; + + ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, + klp_root_kobj, "%s", patch->mod->name); + if (ret) + goto unlock; + + for (obj = patch->objs; obj->funcs; obj++) { + ret = klp_init_object(patch, obj); + if (ret) + goto free; + } + + list_add_tail(&patch->list, &klp_patches); + + mutex_unlock(&klp_mutex); + + return 0; + +free: + klp_free_objects_limited(patch, obj); + kobject_put(&patch->kobj); +unlock: + mutex_unlock(&klp_mutex); + return ret; +} + +/** + * klp_unregister_patch() - unregisters a patch + * @patch: Disabled patch to be unregistered + * + * Frees the data structures and removes the sysfs interface. + * + * Return: 0 on success, otherwise error + */ +int klp_unregister_patch(struct klp_patch *patch) +{ + int ret = 0; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto out; + } + + if (patch->state == KLP_ENABLED) { + ret = -EBUSY; + goto out; + } + + klp_free_patch(patch); + +out: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_unregister_patch); + +/** + * klp_register_patch() - registers a patch + * @patch: Patch to be registered + * + * Initializes the data structure associated with the patch and + * creates the sysfs interface. 
+ * + * Return: 0 on success, otherwise error + */ +int klp_register_patch(struct klp_patch *patch) +{ + int ret; + + if (!klp_initialized()) + return -ENODEV; + + if (!patch || !patch->mod) + return -EINVAL; + + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. Right now, we don't allow patch modules to unload since + * there is currently no method to determine if a thread is still + * running in the patched code contained in the patch module once + * the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + + ret = klp_init_patch(patch); + if (ret) + module_put(patch->mod); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_register_patch); + +static void klp_module_notify_coming(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto err; + + if (patch->state == KLP_DISABLED) + return; + + pr_notice("applying patch '%s' to loading module '%s'\n", + pmod->name, mod->name); + + ret = klp_enable_object(obj); + if (!ret) + return; + +err: + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + pmod->name, mod->name, ret); +} + +static void klp_module_notify_going(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + + if (patch->state == KLP_DISABLED) + goto disabled; + + pr_notice("reverting patch '%s' on unloading module '%s'\n", + pmod->name, mod->name); + + klp_disable_object(obj); + +disabled: + klp_free_object_loaded(obj); +} + +static int klp_module_notify(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct module *mod = data; + struct klp_patch *patch; + struct klp_object *obj; + + if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) + return 0; + + mutex_lock(&klp_mutex); + + /* + * Each module has to know that the notifier has been 
called. + * We never know what module will get patched by a new patch. + */ + if (action == MODULE_STATE_COMING) + mod->klp_alive = true; + else /* MODULE_STATE_GOING */ + mod->klp_alive = false; + + list_for_each_entry(patch, &klp_patches, list) { + for (obj = patch->objs; obj->funcs; obj++) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; + + if (action == MODULE_STATE_COMING) { + obj->mod = mod; + klp_module_notify_coming(patch, obj); + } else /* MODULE_STATE_GOING */ + klp_module_notify_going(patch, obj); + + break; + } + } + + mutex_unlock(&klp_mutex); + + return 0; +} + +static struct notifier_block klp_module_nb = { + .notifier_call = klp_module_notify, + .priority = INT_MIN+1, /* called late but before ftrace notifier */ +}; + +static int klp_init(void) +{ + int ret; + + ret = klp_check_compiler_support(); + if (ret) { + pr_info("Your compiler is too old; turning off.\n"); + return -EINVAL; + } + + ret = register_module_notifier(&klp_module_nb); + if (ret) + return ret; + + klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); + if (!klp_root_kobj) { + ret = -ENOMEM; + goto unregister; + } + + return 0; + +unregister: + unregister_module_notifier(&klp_module_nb); + return ret; +} + +module_init(klp_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..de7a416cca2a 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,11 +1,11 @@ -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_lockdep.o = -pg -CFLAGS_REMOVE_lockdep_proc.o = -pg -CFLAGS_REMOVE_mutex-debug.o = -pg -CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o @@ -14,6 +14,7 @@ ifeq 
($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..ba77ab5f64dd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class) if (!new_class->name) return 0; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) @@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) hash_head = classhashentry(key); /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: + * We do an RCU walk of the hash, see lockdep_free_key_range(). */ - list_for_each_entry(class, hash_head, hash_entry) { + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return NULL; + + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? 
Did someone trample @@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; - unsigned long flags; + + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); if (likely(class)) @@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) key = lock->key->subkeys + subclass; hash_head = classhashentry(key); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } /* * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry(class, hash_head, hash_entry) + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; + } + /* * Allocate a new key from the static array, and add it to * the hash: */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); return NULL; } - raw_local_irq_restore(flags); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); @@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - raw_local_irq_restore(flags); printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) @@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\n"); dump_stack(); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } } out_unlock_set: graph_unlock(); - raw_local_irq_restore(flags); out_set_class_cache: if (!subclass || force) @@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, entry->distance = distance; entry->trace = *trace; /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, 
it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: + * Both allocation and removal are done under the graph lock; but + * iteration is under RCU-sched; see look_up_lock_class() and + * lockdep_free_key_range(). */ list_add_tail_rcu(&entry->entry, head); @@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry, else head = &lock->class->locks_before; - list_for_each_entry(entry, head, entry) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; mark_lock_accessed(entry, lock); @@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry(chain, hash_head, entry) { + list_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (unlikely(!debug_locks)) return; - if (subclass) + if (subclass) { + unsigned long flags; + + if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; register_lock_class(lock, subclass, 1); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + } } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one sync_sched() before getting here, so we're guaranteed + * nobody will look up these exact classes -- they're properly dead but still + * allocated. 
+ */ void lockdep_free_key_range(void *start, unsigned long size) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i; @@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size) if (locked) graph_unlock(); raw_local_irq_restore(flags); + + /* + * Wait for any possible iterators from look_up_lock_class() to pass + * before continuing to free the memory they refer to. + * + * sync_sched() is sufficient because the read-side is IRQ disable. + */ + synchronize_sched(); + + /* + * XXX at this point we could return the resources to the pool; + * instead we leak them. We would need to change to bitmap allocators + * instead of the linear allocators we have now. 
+ */ } void lockdep_reset_lock(struct lockdep_map *lock) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i, j; @@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 4d60986fcbee..75e114bdf3f2 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) */ return; } - ACCESS_ONCE(prev->next) = node; + WRITE_ONCE(prev->next, node); /* Wait until the lock holder passes the lock down. */ arch_mcs_spin_lock_contended(&node->locked); @@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) static inline void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) { - struct mcs_spinlock *next = ACCESS_ONCE(node->next); + struct mcs_spinlock *next = READ_ONCE(node->next); if (likely(!next)) { /* @@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) if (likely(cmpxchg(lock, node, NULL) == node)) return; /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) + while (!(next = READ_ONCE(node->next))) cpu_relax_lowlatency(); } @@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } -/* - * Cancellable version of the MCS lock above. - * - * Intended for adaptive spinning of sleeping locks: - * mutex_lock()/rwsem_down_{read,write}() etc. 
- */ - -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # value */ -}; - -extern bool osq_lock(struct optimistic_spin_queue *lock); -extern void osq_unlock(struct optimistic_spin_queue *lock); - #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 454195194d4a..4cccea6b8934 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,7 +25,7 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/debug_locks.h> -#include "mcs_spinlock.h" +#include <linux/osq_lock.h> /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); * The mutex must later on be released by the same task that * acquired it. Recursive locking is not allowed. The task * may not exit without first unlocking the mutex. Also, kernel - * memory where the mutex resides mutex must not be freed with + * memory where the mutex resides must not be freed with * the mutex still locked. The mutex must first be initialized * (or statically defined) before it can be locked. memset()-ing * the mutex to 0 is not allowed. @@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, } /* - * after acquiring lock with fastpath or when we lost out in contested + * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. * * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, @@ -191,57 +191,61 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, spin_unlock_mutex(&lock->base.wait_lock, flags); } - -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* - * In order to avoid a stampede of mutex spinners from acquiring the mutex - * more or less simultaneously, the spinners need to acquire a MCS lock - * first before spinning on the owner field. 
+ * After acquiring lock in the slowpath set ctx and wake up any + * waiters so they can recheck. * + * Callers must hold the mutex wait_lock. */ - -/* - * Mutex spinning code migrated from kernel/sched/core.c - */ - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +static __always_inline void +ww_mutex_set_context_slowpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) { - if (lock->owner != owner) - return false; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + lock->ctx = ctx; /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. */ - barrier(); - - return owner->on_cpu; + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } } +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. */ static noinline -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) { + bool ret = true; + rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) + while (lock->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner->on_cpu || need_resched()) { + ret = false; break; + } cpu_relax_lowlatency(); } rcu_read_unlock(); - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. 
Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; + return ret; } /* @@ -256,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) return 0; rcu_read_lock(); - owner = ACCESS_ONCE(lock->owner); + owner = READ_ONCE(lock->owner); if (owner) retval = owner->on_cpu; rcu_read_unlock(); @@ -307,6 +311,11 @@ static bool mutex_optimistic_spin(struct mutex *lock, if (!mutex_can_spin_on_owner(lock)) goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ if (!osq_lock(&lock->osq)) goto done; @@ -325,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * As such, when deadlock detection needs to be * performed the optimistic spinning cannot be done. */ - if (ACCESS_ONCE(ww->ctx)) + if (READ_ONCE(ww->ctx)) break; } @@ -333,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * If there's an owner, wait for it to either * release the lock or go to sleep. 
*/ - owner = ACCESS_ONCE(lock->owner); + owner = READ_ONCE(lock->owner); if (owner && !mutex_spin_on_owner(lock, owner)) break; @@ -469,10 +478,10 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) EXPORT_SYMBOL(ww_mutex_unlock); static inline int __sched -__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); if (!hold_ctx) return 0; @@ -557,7 +566,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } if (use_ww_ctx && ww_ctx->acquired > 0) { - ret = __mutex_lock_check_stamp(lock, ww_ctx); + ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; } @@ -569,6 +578,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } + __set_task_state(task, TASK_RUNNING); + mutex_remove_waiter(lock, &waiter, current_thread_info()); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) @@ -582,23 +593,7 @@ skip_wait: if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct mutex_waiter *cur; - - /* - * This branch gets optimized out for the common case, - * and is only important for ww_mutex_lock. - */ - ww_mutex_lock_acquired(ww, ww_ctx); - ww->ctx = ww_ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. 
- */ - list_for_each_entry(cur, &lock->wait_list, list) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + ww_mutex_set_context_slowpath(ww, ww_ctx); } spin_unlock_mutex(&lock->wait_lock, flags); diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c index 9887a905a762..dc85ee23a26f 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/osq_lock.c @@ -1,8 +1,6 @@ #include <linux/percpu.h> #include <linux/sched.h> -#include "mcs_spinlock.h" - -#ifdef CONFIG_SMP +#include <linux/osq_lock.h> /* * An MCS like lock especially tailored for optimistic spinning for sleeping @@ -100,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; - ACCESS_ONCE(prev->next) = node; + WRITE_ONCE(prev->next, node); /* * Normally @prev is untouchable after the above store; because at that @@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!smp_load_acquire(&node->locked)) { + while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. */ @@ -150,7 +148,7 @@ unqueue: * Or we race against a concurrent unqueue()'s step-B, in which * case its step-C will write us a new @node->prev pointer. */ - prev = ACCESS_ONCE(node->prev); + prev = READ_ONCE(node->prev); } /* @@ -172,8 +170,8 @@ unqueue: * it will wait in Step-A. 
*/ - ACCESS_ONCE(next->prev) = prev; - ACCESS_ONCE(prev->next) = next; + WRITE_ONCE(next->prev, prev); + WRITE_ONCE(prev->next, next); return false; } @@ -195,14 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock) node = this_cpu_ptr(&osq_node); next = xchg(&node->next, NULL); if (next) { - ACCESS_ONCE(next->locked) = 1; + WRITE_ONCE(next->locked, 1); return; } next = osq_wait_next(lock, node, NULL); if (next) - ACCESS_ONCE(next->locked) = 1; + WRITE_ONCE(next->locked, 1); } - -#endif - diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7c98873a3077..b73279367087 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * * @task: the task owning the mutex (owner) for which a chain walk is * probably needed - * @deadlock_detect: do we have to carry out deadlock detection? + * @chwalk: do we have to carry out deadlock detection? * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck * things for a task that has just got its priority adjusted, and * is waiting on a mutex) @@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); } + __set_current_state(TASK_RUNNING); return ret; } @@ -1188,12 +1189,13 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); if (likely(!ret)) + /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - set_current_state(TASK_RUNNING); - if (unlikely(ret)) { - remove_waiter(lock, &waiter); + __set_current_state(TASK_RUNNING); + if (rt_mutex_has_waiters(lock)) + remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); } @@ -1626,10 +1628,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); + /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - 
set_current_state(TASK_RUNNING); - if (unlikely(ret)) remove_waiter(lock, waiter); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2c93571162cb..3a5048572065 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) list_del(&waiter->list); tsk = waiter->task; + /* + * Make sure we do not wakeup the next reader before + * setting the nil condition to grant the next reader; + * otherwise we could miss the wakeup on the other + * side and end up sleeping again. See the pairing + * in rwsem_down_read_failed(). + */ smp_mb(); waiter->task = NULL; wake_up_process(tsk); @@ -154,7 +161,7 @@ void __sched __down_read(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); } - tsk->state = TASK_RUNNING; + __set_task_state(tsk, TASK_RUNNING); out: ; } diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 7628c3fc37ca..3417d0172a5d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -14,8 +14,9 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/sched/rt.h> +#include <linux/osq_lock.h> -#include "mcs_spinlock.h" +#include "rwsem.h" /* * Guide to the rw_semaphore's count field for common values. @@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + /* + * Make sure we do not wakeup the next reader before + * setting the nil condition to grant the next reader; + * otherwise we could miss the wakeup on the other + * side and end up sleeping again. See the pairing + * in rwsem_down_read_failed(). 
+ */ smp_mb(); waiter->task = NULL; wake_up_process(tsk); @@ -242,8 +250,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) schedule(); } - tsk->state = TASK_RUNNING; - + __set_task_state(tsk, TASK_RUNNING); return sem; } EXPORT_SYMBOL(rwsem_down_read_failed); @@ -259,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { if (!list_is_singular(&sem->wait_list)) rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + rwsem_set_owner(sem); return true; } @@ -271,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = ACCESS_ONCE(sem->count); + long old, count = READ_ONCE(sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) + if (old == count) { + rwsem_set_owner(sem); return true; + } count = old; } @@ -288,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; - bool on_cpu = false; + bool ret = true; if (need_resched()) return false; rcu_read_lock(); - owner = ACCESS_ONCE(sem->owner); - if (owner) - on_cpu = owner->on_cpu; - rcu_read_unlock(); - - /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath, then there is a possibility reader(s) may have the lock. - * To be safe, avoid spinning in these situations. 
- */ - return on_cpu; -} - -static inline bool owner_running(struct rw_semaphore *sem, - struct task_struct *owner) -{ - if (sem->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * sem->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); + owner = READ_ONCE(sem->owner); + if (!owner) { + long count = READ_ONCE(sem->count); + /* + * If sem->owner is not set, yet we have just recently entered the + * slowpath with the lock being active, then there is a possibility + * reader(s) may have the lock. To be safe, bail spinning in these + * situations. + */ + if (count & RWSEM_ACTIVE_MASK) + ret = false; + goto done; + } - return owner->on_cpu; + ret = owner->on_cpu; +done: + rcu_read_unlock(); + return ret; } static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) { + long count; + rcu_read_lock(); - while (owner_running(sem, owner)) { - if (need_resched()) - break; + while (sem->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking sem->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + /* abort spinning when need_resched or owner is not running */ + if (!owner->on_cpu || need_resched()) { + rcu_read_unlock(); + return false; + } cpu_relax_lowlatency(); } rcu_read_unlock(); + if (READ_ONCE(sem->owner)) + return true; /* new owner, continue spinning */ + /* - * We break out the loop above on need_resched() or when the - * owner changed, which is a sign for heavy contention. Return - * success only when sem->owner is NULL. + * When the owner is not set, the lock could be free or + * held by readers. Check the counter to verify the + * state. 
*/ - return sem->owner == NULL; + count = READ_ONCE(sem->count); + return (count == 0 || count == RWSEM_WAITING_BIAS); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) @@ -359,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) goto done; while (true) { - owner = ACCESS_ONCE(sem->owner); + owner = READ_ONCE(sem->owner); if (owner && !rwsem_spin_on_owner(sem, owner)) break; @@ -433,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = ACCESS_ONCE(sem->count); + count = READ_ONCE(sem->count); /* * If there were already threads queued before us and there are diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e2d3bc7f03b4..205be0ce34de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -9,29 +9,9 @@ #include <linux/sched.h> #include <linux/export.h> #include <linux/rwsem.h> - #include <linux/atomic.h> -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ - sem->owner = current; -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ - sem->owner = NULL; -} - -#else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} -#endif +#include "rwsem.h" /* * lock for reading diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h new file mode 100644 index 000000000000..870ed9a5b426 --- /dev/null +++ b/kernel/locking/rwsem.h @@ -0,0 +1,20 @@ +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif diff --git 
a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..db3ccb1dd614 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh_nested); + unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { diff --git a/kernel/module.c b/kernel/module.c index 3965511ae133..ec53f594e9c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -772,9 +772,18 @@ static int try_stop_module(struct module *mod, int flags, int *forced) return 0; } -unsigned long module_refcount(struct module *mod) +/** + * module_refcount - return the refcount or -1 if unloading + * + * @mod: the module we're checking + * + * Returns: + * -1 if the module is in the process of unloading + * otherwise the number of references in the kernel to the module + */ +int module_refcount(struct module *mod) { - return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; + return atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -856,7 +865,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %lu ", module_refcount(mod)); + seq_printf(m, " %i ", module_refcount(mod)); /* * Always include a trailing , so userspace can differentiate @@ -908,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); + return sprintf(buffer, "%i\n", 
module_refcount(mk->mod)); } static struct module_attribute modinfo_refcnt = @@ -1216,6 +1225,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, const unsigned long *crc; int err; + /* + * The module_mutex should not be a heavily contended lock; + * if we get the occasional sleep here, we'll go an extra iteration + * in the wait_event_interruptible(), which is harmless. + */ + sched_annotate_sleep(); mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); @@ -1795,7 +1810,7 @@ static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif -void __weak module_free(struct module *mod, void *module_region) +void __weak module_memfree(void *module_region) { vfree(module_region); } @@ -1804,6 +1819,10 @@ void __weak module_arch_cleanup(struct module *mod) { } +void __weak module_arch_freeing_init(struct module *mod) +{ +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1841,16 +1860,17 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); + module_arch_freeing_init(mod); + module_memfree(mod->module_init); kfree(mod->args); percpu_modfree(mod); - /* Free lock-classes: */ + /* Free lock-classes; relies on the preceding sync_rcu(). 
*/ lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ unset_module_core_ro_nx(mod); - module_free(mod, mod->module_core); + module_memfree(mod->module_core); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2291,11 +2311,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_size += strtab_size; + mod->core_size = debug_align(mod->core_size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; + mod->init_size = debug_align(mod->init_size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } @@ -2457,6 +2479,23 @@ static int elf_header_check(struct load_info *info) return 0; } +#define COPY_CHUNK_SIZE (16*PAGE_SIZE) + +static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) +{ + do { + unsigned long n = min(len, COPY_CHUNK_SIZE); + + if (copy_from_user(dst, usrc, n) != 0) + return -EFAULT; + cond_resched(); + dst += n; + usrc += n; + len -= n; + } while (len); + return 0; +} + /* Sets info->hdr and info->len. 
*/ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -2476,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (!info->hdr) return -ENOMEM; - if (copy_from_user(info->hdr, umod, info->len) != 0) { + if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { vfree(info->hdr); return -EFAULT; } @@ -2785,7 +2824,7 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_free(mod, mod->module_core); + module_memfree(mod->module_core); return -ENOMEM; } memset(ptr, 0, mod->init_size); @@ -2930,8 +2969,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); - module_free(mod, mod->module_init); - module_free(mod, mod->module_core); + module_arch_freeing_init(mod); + module_memfree(mod->module_init); + module_memfree(mod->module_core); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -2963,6 +3003,12 @@ static bool finished_loading(const char *name) struct module *mod; bool ret; + /* + * The module_mutex should not be a heavily contended lock; + * if we get the occasional sleep here, we'll go an extra iteration + * in the wait_event_interruptible(), which is harmless. 
+ */ + sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); ret = !mod || mod->state == MODULE_STATE_LIVE @@ -2983,10 +3029,36 @@ static void do_mod_ctors(struct module *mod) #endif } -/* This is where the real work happens */ -static int do_init_module(struct module *mod) +/* For freeing module_init on success, in case kallsyms traversing */ +struct mod_initfree { + struct rcu_head rcu; + void *module_init; +}; + +static void do_free_init(struct rcu_head *head) +{ + struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); + module_memfree(m->module_init); + kfree(m); +} + +/* + * This is where the real work happens. + * + * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb + * helper command 'lx-symbols'. + */ +static noinline int do_init_module(struct module *mod) { int ret = 0; + struct mod_initfree *freeinit; + + freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); + if (!freeinit) { + ret = -ENOMEM; + goto fail; + } + freeinit->module_init = mod->module_init; /* * We want to find out whether @mod uses async during init. Clear @@ -2999,18 +3071,7 @@ static int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { - /* - * Init routine failed: abort. Try to protect us from - * buggy refcounters. 
- */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; + goto fail_free_freeinit; } if (ret > 0) { pr_warn("%s: '%s'->init suspiciously returned %d, it should " @@ -3055,15 +3116,35 @@ static int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif unset_module_init_ro_nx(mod); - module_free(mod, mod->module_init); + module_arch_freeing_init(mod); mod->module_init = NULL; mod->init_size = 0; mod->init_ro_size = 0; mod->init_text_size = 0; + /* + * We want to free module_init, but be aware that kallsyms may be + * walking this with preempt disabled. In all the failure paths, + * we call synchronize_rcu/synchronize_sched, but we don't want + * to slow down the success path, so use actual RCU here. + */ + call_rcu(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); return 0; + +fail_free_freeinit: + kfree(freeinit); +fail: + /* Try to protect us from buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; } static int may_init_module(void) @@ -3075,32 +3156,6 @@ static int may_init_module(void) } /* - * Can't use wait_event_interruptible() because our condition - * 'finished_loading()' contains a blocking primitive itself (mutex_lock). 
- */ -static int wait_finished_loading(struct module *mod) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - add_wait_queue(&module_wq, &wait); - for (;;) { - if (finished_loading(mod->name)) - break; - - if (signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); - } - remove_wait_queue(&module_wq, &wait); - - return ret; -} - -/* * We try to place it in the list now to make sure it's unique before * we dedicate too many resources. In particular, temporary percpu * memory exhaustion. @@ -3120,8 +3175,8 @@ again: || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); - - err = wait_finished_loading(mod); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); if (err) goto out_unlocked; goto again; @@ -3220,7 +3275,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->sig_ok = info->sig_ok; if (!mod->sig_ok) { pr_notice_once("%s: module verification failed: signature " - "and/or required key missing - tainting " + "and/or required key missing - tainting " "kernel\n", mod->name); add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); } @@ -3334,6 +3389,9 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_rcu(); mutex_unlock(&module_mutex); free_module: + /* Free lock-classes; relies on the preceding sync_rcu() */ + lockdep_free_key_range(mod->module_core, mod->core_size); + module_deallocate(mod, info); free_copy: free_copy(info); diff --git a/kernel/notifier.c b/kernel/notifier.c index 4803da6eab62..ae9fc7cc360e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); +#ifdef CONFIG_SRCU /* * SRCU notifier chain routines. 
Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). @@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); +#endif /* CONFIG_SRCU */ + static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, diff --git a/kernel/padata.c b/kernel/padata.c index 161402f0b517..b38bea9c466a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -917,15 +917,10 @@ static ssize_t show_cpumask(struct padata_instance *pinst, else cpumask = pinst->cpumask.pcpu; - len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), - nr_cpu_ids); - if (PAGE_SIZE - len < 2) - len = -EINVAL; - else - len += sprintf(buf + len, "\n"); - + len = snprintf(buf, PAGE_SIZE, "%*pb\n", + nr_cpu_ids, cpumask_bits(cpumask)); mutex_unlock(&pinst->lock); - return len; + return len < PAGE_SIZE ? len : -EINVAL; } static ssize_t store_cpumask(struct padata_instance *pinst, diff --git a/kernel/panic.c b/kernel/panic.c index 4d8d6f906dec..8136ad76e5fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -226,6 +226,7 @@ static const struct tnt tnts[] = { { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, { TAINT_SOFTLOCKUP, 'L', ' ' }, + { TAINT_LIVEPATCH, 'K', ' ' }, }; /** @@ -246,6 +247,7 @@ static const struct tnt tnts[] = { * 'O' - Out-of-tree module has been loaded. * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. + * 'K' - Kernel has been live patched. * * The string is overwritten by the next call to print_tainted(). */ diff --git a/kernel/params.c b/kernel/params.c index 0af9b2c4e56c..728e05b167de 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -642,12 +642,15 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. 
*/ + memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); mk->mp->attrs[mk->mp->num].param = kp; mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. */ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + else + mk->mp->attrs[mk->mp->num].mattr.store = NULL; mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; mk->mp->num++; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 48b28d387c7f..7e01f78f0417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -251,6 +251,7 @@ config APM_EMULATION config PM_OPP bool + select SRCU ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. This diff --git a/kernel/power/process.c b/kernel/power/process.c index 5a6ec8678b9a..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs = elapsed_msecs64; if (todo) { - printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " + pr_cont("\n"); + pr_err("Freezing of tasks %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, @@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only) read_unlock(&tasklist_lock); } } else { - printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, + pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, elapsed_msecs % 1000); } return todo ? 
-EBUSY : 0; } -static bool __check_frozen_processes(void) -{ - struct task_struct *g, *p; - - for_each_process_thread(g, p) - if (p != current && !freezer_should_skip(p) && !frozen(p)) - return false; - - return true; -} - -/* - * Returns true if all freezable tasks (except for current) are frozen already - */ -static bool check_frozen_processes(void) -{ - bool ret; - - read_lock(&tasklist_lock); - ret = __check_frozen_processes(); - read_unlock(&tasklist_lock); - return ret; -} - /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -142,7 +118,6 @@ static bool check_frozen_processes(void) int freeze_processes(void) { int error; - int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -155,31 +130,24 @@ int freeze_processes(void) atomic_inc(&system_freezing_cnt); pm_wakeup_clear(); - printk("Freezing user space processes ... "); + pr_info("Freezing user space processes ... "); pm_freezing = true; - oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); - - /* - * There might have been an OOM kill while we were - * freezing tasks and the killed task might be still - * on the way out so we have to double check for race. - */ - if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { - __usermodehelper_set_disable_depth(UMH_ENABLED); - printk("OOM in progress."); - error = -EBUSY; - } else { - printk("done."); - } + pr_cont("done."); } - printk("\n"); + pr_cont("\n"); BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disbale + * the OOM killer to disallow any further interference with + * killable tasks. 
+ */ + if (!error && !oom_killer_disable()) + error = -EBUSY; + if (error) thaw_processes(); return error; @@ -197,13 +165,14 @@ int freeze_kernel_threads(void) { int error; - printk("Freezing remaining freezable tasks ... "); + pr_info("Freezing remaining freezable tasks ... "); + pm_nosig_freezing = true; error = try_to_freeze_tasks(false); if (!error) - printk("done."); + pr_cont("done."); - printk("\n"); + pr_cont("\n"); BUG_ON(in_atomic()); if (error) @@ -224,7 +193,7 @@ void thaw_processes(void) oom_killer_enable(); - printk("Restarting tasks ... "); + pr_info("Restarting tasks ... "); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); @@ -243,7 +212,7 @@ void thaw_processes(void) usermodehelper_enable(); schedule(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } @@ -252,7 +221,7 @@ void thaw_kernel_threads(void) struct task_struct *g, *p; pm_nosig_freezing = false; - printk("Restarting kernel threads ... "); + pr_info("Restarting kernel threads ... 
"); thaw_workqueues(); @@ -264,5 +233,5 @@ void thaw_kernel_threads(void) read_unlock(&tasklist_lock); schedule(); - printk("done.\n"); + pr_cont("done.\n"); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 5f4c006c4b1e..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -41,6 +41,8 @@ #include <linux/platform_device.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) c->target_value = value; } +static inline int pm_qos_get_value(struct pm_qos_constraints *c); +static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) +{ + struct pm_qos_object *qos = (struct pm_qos_object *)s->private; + struct pm_qos_constraints *c; + struct pm_qos_request *req; + char *type; + unsigned long flags; + int tot_reqs = 0; + int active_reqs = 0; + + if (IS_ERR_OR_NULL(qos)) { + pr_err("%s: bad qos param!\n", __func__); + return -EINVAL; + } + c = qos->constraints; + if (IS_ERR_OR_NULL(c)) { + pr_err("%s: Bad constraints on qos?\n", __func__); + return -EINVAL; + } + + /* Lock to ensure we have a snapshot */ + spin_lock_irqsave(&pm_qos_lock, flags); + if (plist_head_empty(&c->list)) { + seq_puts(s, "Empty!\n"); + goto out; + } + + switch (c->type) { + case PM_QOS_MIN: + type = "Minimum"; + break; + case PM_QOS_MAX: + type = "Maximum"; + break; + case PM_QOS_SUM: + type = "Sum"; + break; + default: + type = "Unknown"; + } + + plist_for_each_entry(req, &c->list, node) { + char *state = "Default"; + + if ((req->node).prio != c->default_value) { + active_reqs++; + state = "Active"; + } + tot_reqs++; + seq_printf(s, "%d: %d: %s\n", tot_reqs, + (req->node).prio, state); + } + + seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n", + type, pm_qos_get_value(c), active_reqs, tot_reqs); + +out: + 
spin_unlock_irqrestore(&pm_qos_lock, flags); + return 0; +} + +static int pm_qos_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, pm_qos_dbg_show_requests, + inode->i_private); +} + +static const struct file_operations pm_qos_debug_fops = { + .open = pm_qos_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /** * pm_qos_update_target - manages the constraints list and calls the notifiers * if needed @@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); /* User space interface to PM QoS classes via misc devices */ -static int register_pm_qos_misc(struct pm_qos_object *qos) +static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) { qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; qos->pm_qos_power_miscdev.name = qos->name; qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + if (d) { + (void)debugfs_create_file(qos->name, S_IRUGO, d, + (void *)qos, &pm_qos_debug_fops); + } + return misc_register(&qos->pm_qos_power_miscdev); } @@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void) { int ret = 0; int i; + struct dentry *d; BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + d = debugfs_create_dir("pm_qos", NULL); + if (IS_ERR_OR_NULL(d)) + d = NULL; + for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { - ret = register_pm_qos_misc(pm_qos_array[i]); + ret = register_pm_qos_misc(pm_qos_array[i], d); if (ret < 0) { printk(KERN_ERR "pm_qos_param: %s setup failed\n", pm_qos_array[i]->name); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0c40c16174b4..5235dd4e1e2f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } } -static bool is_nosave_page(unsigned long pfn) -{ - struct nosave_region *region; - - list_for_each_entry(region, &nosave_regions, 
list) { - if (pfn >= region->start_pfn && pfn < region->end_pfn) { - pr_err("PM: %#010llx in e820 nosave region: " - "[mem %#010llx-%#010llx]\n", - (unsigned long long) pfn << PAGE_SHIFT, - (unsigned long long) region->start_pfn << PAGE_SHIFT, - ((unsigned long long) region->end_pfn << PAGE_SHIFT) - - 1); - return true; - } - } - - return false; -} - /** * create_basic_memory_bitmaps - create bitmaps needed for marking page * frames that should not be saved and free page frames. The pointers @@ -1472,9 +1453,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, /** * free_unnecessary_pages - Release preallocated pages not needed for the image */ -static void free_unnecessary_pages(void) +static unsigned long free_unnecessary_pages(void) { - unsigned long save, to_free_normal, to_free_highmem; + unsigned long save, to_free_normal, to_free_highmem, free; save = count_data_pages(); if (alloc_normal >= save) { @@ -1495,6 +1476,7 @@ static void free_unnecessary_pages(void) else to_free_normal = 0; } + free = to_free_normal + to_free_highmem; memory_bm_position_reset(©_bm); @@ -1518,6 +1500,8 @@ static void free_unnecessary_pages(void) swsusp_unset_page_free(page); __free_page(page); } + + return free; } /** @@ -1707,7 +1691,7 @@ int hibernate_preallocate_memory(void) * pages in memory, but we have allocated more. Release the excessive * ones now. 
*/ - free_unnecessary_pages(); + pages -= free_unnecessary_pages(); out: stop = ktime_get(); @@ -2039,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) do { pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) + if (likely(pfn_valid(pfn))) swsusp_set_page_free(pfn_to_page(pfn)); else return -EFAULT; @@ -2310,8 +2294,6 @@ static inline void free_highmem_data(void) free_image_page(buffer, PG_UNSAFE_CLEAR); } #else -static inline int get_safe_write_buffer(void) { return 0; } - static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c347e3ce3a55..b7d6b3a721b1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); -static bool suspend_freeze_wake; + +enum freeze_state __read_mostly suspend_freeze_state; +static DEFINE_SPINLOCK(suspend_freeze_lock); void freeze_set_ops(const struct platform_freeze_ops *ops) { @@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops) static void freeze_begin(void) { - suspend_freeze_wake = false; + suspend_freeze_state = FREEZE_STATE_NONE; } static void freeze_enter(void) { - cpuidle_use_deepest_state(true); + spin_lock_irq(&suspend_freeze_lock); + if (pm_wakeup_pending()) + goto out; + + suspend_freeze_state = FREEZE_STATE_ENTER; + spin_unlock_irq(&suspend_freeze_lock); + + get_online_cpus(); cpuidle_resume(); - wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + + /* Push all the CPUs into the idle loop. */ + wake_up_all_idle_cpus(); + pr_debug("PM: suspend-to-idle\n"); + /* Make the current CPU wait so it can enter the idle loop too. 
*/ + wait_event(suspend_freeze_wait_head, + suspend_freeze_state == FREEZE_STATE_WAKE); + pr_debug("PM: resume from suspend-to-idle\n"); + cpuidle_pause(); - cpuidle_use_deepest_state(false); + put_online_cpus(); + + spin_lock_irq(&suspend_freeze_lock); + + out: + suspend_freeze_state = FREEZE_STATE_NONE; + spin_unlock_irq(&suspend_freeze_lock); } void freeze_wake(void) { - suspend_freeze_wake = true; - wake_up(&suspend_freeze_wait_head); + unsigned long flags; + + spin_lock_irqsave(&suspend_freeze_lock, flags); + if (suspend_freeze_state > FREEZE_STATE_NONE) { + suspend_freeze_state = FREEZE_STATE_WAKE; + wake_up(&suspend_freeze_wait_head); + } + spin_unlock_irqrestore(&suspend_freeze_lock, flags); } EXPORT_SYMBOL_GPL(freeze_wake); diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index cbd69d842341..2ca4a8b5fe57 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -3,7 +3,7 @@ struct console_cmdline { - char name[8]; /* Name of the driver */ + char name[16]; /* Name of the driver */ int index; /* Minor dev. to use */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02d6b6d28796..bb0635bd74f2 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str) early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); +MODULE_PARM_DESC(ignore_loglevel, + "ignore loglevel setting (prints all kernel messages to the console)"); #ifdef CONFIG_BOOT_PRINTK_DELAY @@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len) } /* - * Zap console related locks when oopsing. 
Only zap at most once - * every 10 seconds, to leave time for slow consoles to print a - * full oops. + * Zap console related locks when oopsing. + * To leave time for slow consoles to print a full oops, + * only zap at most once every 30 seconds. */ static void zap_locks(void) { static unsigned long oops_timestamp; if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) + !time_after(jiffies, oops_timestamp + 30 * HZ)) return; oops_timestamp = jiffies; @@ -1811,7 +1811,7 @@ int vprintk_default(const char *fmt, va_list args) #ifdef CONFIG_KGDB_KDB if (unlikely(kdb_trap_printk)) { - r = vkdb_printf(fmt, args); + r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); return r; } #endif @@ -2464,6 +2464,7 @@ void register_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && diff --git a/kernel/profile.c b/kernel/profile.c index 54bf5ba26420..a7bcd28d6e9f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -422,8 +422,7 @@ void profile_tick(int type) static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) { - seq_cpumask(m, prof_cpu_mask); - seq_putc(m, '\n'); + seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask)); return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1eb9d90c3af9..227fec36b12a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, } #if defined CONFIG_COMPAT -#include <linux/compat.h> int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index e6fae503d1bc..50a808424b06 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,5 @@ -obj-y += update.o srcu.o +obj-y += update.o 
+obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 07bb02eda844..80adef7d4c3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void); void rcu_early_boot_tests(void); +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4d559baf06e0..30d42aa55d83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,7 +244,8 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); - int (*completed)(void); + unsigned long (*started)(void); + unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); @@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) rcu_read_unlock(); } -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - /* * Update callback in the pipe. This should be invoked after a grace period. 
*/ @@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } -static int rcu_no_completed(void) +static unsigned long rcu_no_completed(void) { return 0; } @@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, + .started = rcu_batches_started, + .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, @@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) rcu_read_unlock_bh(); } -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - static void rcu_bh_torture_deferred_free(struct rcu_torture *p) { call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); @@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, @@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = rcu_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) srcu_read_unlock(&srcu_ctl, idx); } -static int srcu_torture_completed(void) +static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(&srcu_ctl); } @@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .started = NULL, .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, @@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = tasks_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, @@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; - int completed; - int completed_end; + unsigned long started; + unsigned long completed; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) unsigned long long ts; idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); + started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... 
*/ completed = RCU_TORTURE_PIPE_LEN; @@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { - int completed; - int completed_end; + unsigned long started; + unsigned long completed; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; @@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); + ts, started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg) cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; + pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", + atomic_read(&barrier_cbs_invoked), + n_barrier_cbs); WARN_ON_ONCE(1); } n_barrier_successes++; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3eb2f7b..445bf8ffe3fb 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. 
*/ -long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0db5649f8817..cc9ceca7bde1 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); -static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - #include "tiny_plugin.h" -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ -static void rcu_idle_enter_common(long long newval) -{ - if (newval) { - RCU_TRACE(trace_rcu_dyntick(TPS("--="), - rcu_dynticks_nesting, newval)); - rcu_dynticks_nesting = newval; - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("Start"), - rcu_dynticks_nesting, newval)); - if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), - rcu_dynticks_nesting, newval)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_sched_qs(); /* implies rcu_bh_inc() */ - barrier(); - rcu_dynticks_nesting = newval; -} - /* * Enter idle, which is an extended quiescent state if we have fully - * entered that mode (i.e., if the new value of dynticks_nesting is zero). + * entered that mode. 
*/ void rcu_idle_enter(void) { - unsigned long flags; - long long newval; - - local_irq_save(flags); - WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) - newval = 0; - else - newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(newval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_irq_exit(void) { - unsigned long flags; - long long newval; - - local_irq_save(flags); - newval = rcu_dynticks_nesting - 1; - WARN_ON_ONCE(newval < 0); - rcu_idle_enter_common(newval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_irq_exit); -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ -static void rcu_idle_exit_common(long long oldval) -{ - if (oldval) { - RCU_TRACE(trace_rcu_dyntick(TPS("++="), - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); - if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - /* * Exit idle, so that we are no longer in an extended quiescent state. 
*/ void rcu_idle_exit(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) - rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else - rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(oldval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_irq_enter(void) { - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting++; - WARN_ON_ONCE(rcu_dynticks_nesting == 0); - rcu_idle_exit_common(oldval); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_irq_enter); @@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); */ bool notrace __rcu_is_watching(void) { - return rcu_dynticks_nesting; + return true; } EXPORT_SYMBOL(__rcu_is_watching); #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ /* - * Test whether the current CPU was interrupted from idle. Nested - * interrupts don't count, we must be running at the first interrupt - * level. - */ -static int rcu_is_cpu_rrupt_from_idle(void) -{ - return rcu_dynticks_nesting <= 1; -} - -/* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers * invoking call_rcu(). 
@@ -250,7 +150,7 @@ void rcu_bh_qs(void) void rcu_check_callbacks(int user) { RCU_TRACE(check_cpu_stalls()); - if (user || rcu_is_cpu_rrupt_from_idle()) + if (user) rcu_sched_qs(); else if (!in_softirq()) rcu_bh_qs(); @@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head, rcp->curtail = &head->next; RCU_TRACE(rcp->qlen++); local_irq_restore(flags); + + if (unlikely(is_idle_task(current))) { + /* force scheduling for rcu_sched_qs() */ + resched_cpu(0); + } } /* @@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c56569127..f94e209a10d6 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) rcp->ticks_this_gp++; j = jiffies; js = ACCESS_ONCE(rcp->jiffies_stall); - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, jiffies - rcp->gp_start, rcp->qlen); dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) + } else if (ULONG_CMP_GE(j, js)) { ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); + } } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7680fc275036..48d640ca1a05 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) static void 
invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +/* rcuc/rcub kthread realtime priority */ +static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; +module_param(kthread_prio, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -284,6 +291,22 @@ void rcu_note_context_switch(void) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiesecent state for all RCU flavors. If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desparate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU-sched batches processed thus far for debug & stats. + * Return the number of RCU batches started thus far for debug & stats. 
+ */ +unsigned long rcu_batches_started(void) +{ + return rcu_state_p->gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started); + +/* + * Return the number of RCU-sched batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_sched(void) +{ + return rcu_sched_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_sched); + +/* + * Return the number of RCU BH batches started thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_started_bh(void) +{ + return rcu_bh_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_bh); + +/* + * Return the number of RCU batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed(void) +{ + return rcu_state_p->completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU-sched batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* - * Return the number of RCU BH batches processed thus far for debug & stats. + * Return the number of RCU BH batches completed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } @@ -759,39 +818,71 @@ void rcu_irq_enter(void) /** * rcu_nmi_enter - inform RCU of entry to NMI context * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is active. + * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and + * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know + * that the CPU is active. This implementation permits nested NMIs, as + * long as the nesting level does not overflow an int. (You will probably + * run out of stack space first.) 
*/ void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + int incby = 2; - if (rdtp->dynticks_nmi_nesting == 0 && - (atomic_read(&rdtp->dynticks) & 0x1)) - return; - rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + /* Complain about underflow. */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); + + /* + * If idle from RCU viewpoint, atomically increment ->dynticks + * to mark non-idle and increment ->dynticks_nmi_nesting by one. + * Otherwise, increment ->dynticks_nmi_nesting by two. This means + * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * to be in the outermost NMI handler that interrupted an RCU-idle + * period (observation due to Andy Lutomirski). + */ + if (!(atomic_read(&rdtp->dynticks) & 0x1)) { + smp_mb__before_atomic(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* atomic_inc() before later RCU read-side crit sects */ + smp_mb__after_atomic(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + incby = 1; + } + rdtp->dynticks_nmi_nesting += incby; + barrier(); } /** * rcu_nmi_exit - inform RCU of exit from NMI context * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is no longer active. + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. 
*/ void rcu_nmi_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (rdtp->dynticks_nmi_nesting == 0 || - --rdtp->dynticks_nmi_nesting != 0) + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (rdtp->dynticks_nmi_nesting != 1) { + rdtp->dynticks_nmi_nesting -= 2; return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + rdtp->dynticks_nmi_nesting = 0; /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); @@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); return 1; } else { + if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, + rdp->mynode->gpnum)) + ACCESS_ONCE(rdp->gpwrap) = true; return 0; } } /* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. - */ -extern void resched_cpu(int cpu); - -/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) j1 = rcu_jiffies_till_stall_check(); ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; + rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); +} + +/* + * Complain about starvation of grace-period kthread. 
+ */ +static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) +{ + unsigned long gpa; + unsigned long j; + + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + if (j - gpa > 2 * HZ) + pr_err("%s kthread starved for %ld jiffies!\n", + rsp->name, j - gpa); } /* @@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } -static void print_other_cpu_stall(struct rcu_state *rsp) +static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; long delta; unsigned long flags; + unsigned long gpa; + unsigned long j; int ndetected = 0; struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; @@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); } - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); - if (ndetected == 0) - pr_err("INFO: Stall ended before state dump start\n"); - else + if (ndetected) { rcu_dump_cpu_stacks(rsp); + } else { + if (ACCESS_ONCE(rsp->gpnum) != gpnum || + ACCESS_ONCE(rsp->completed) == gpnum) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = ACCESS_ONCE(rsp->gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + rsp->name, j - gpa, j, gpa, + jiffies_till_next_fqs); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } /* Complain about tasks blocking the grace period. 
*/ - rcu_print_detail_task_stall(rsp); + rcu_check_gp_kthread_starvation(rsp); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp) pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); + + rcu_check_gp_kthread_starvation(rsp); + rcu_dump_cpu_stacks(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp); + print_other_cpu_stall(rsp, gpnum); } } @@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed) { + if (rdp->completed == rnp->completed && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* No grace period end, so just accelerate recent callbacks. 
*/ ret = rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } - if (rdp->gpnum != rnp->gpnum) { + if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); + ACCESS_ONCE(rdp->gpwrap) = false; } return ret; } @@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ + rdp->completed == ACCESS_ONCE(rnp->completed) && + !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. 
*/ local_irq_restore(flags); return; @@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); @@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } mutex_unlock(&rsp->onoff_mutex); @@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ @@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + ACCESS_ONCE(rsp->gp_activity) = jiffies; raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; @@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg) if (rcu_gp_init(rsp)) break; cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg) ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; } else { /* Deal with stray signal. 
*/ cond_resched_rcu_qs(); + ACCESS_ONCE(rsp->gp_activity) = jiffies; WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) } /* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. 
Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely. That said, invoking it after the fact will cost you + * a needless lock acquisition. So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) + return; + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (!rnp) + break; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); /* GP memory ordering. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } +} + +/* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also @@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; - int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); + raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ - do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. 
*/ - smp_mb__after_unlock_lock(); - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - break; - } - if (rnp == rdp->mynode) - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock - * held leads to deadlock. - */ - raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinit &= ~rdp->grpmask; + if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) + rcu_cleanup_dead_rnp(rnp); + rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. 
*/ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); @@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } +static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ +} + static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp, } raw_spin_unlock_irqrestore(&rnp->lock, flags); } - rnp = rcu_get_root(rsp); - if (rnp->qsmask == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - } } /* @@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) * Schedule RCU callback invocation. If the specified type of RCU * does not support RCU priority boosting, just do a direct call, * otherwise wake up the per-CPU kernel kthread. Note that because we - * are running on the current CPU with interrupts disabled, the + * are running on the current CPU with softirqs disabled, the * rcu_cpu_kthread_task cannot disappear out from under us. */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) @@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has a new RCU grace period started? 
*/ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || + unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } @@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp) } else { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, rsp->n_barrier_done); + smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); @@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - init_callback_list(rdp); - rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); @@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = 0; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } @@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self, static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; + int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct rcu_state *rsp; + struct sched_param sp; struct task_struct *t; + /* Force priority into range. 
*/ + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + if (kthread_prio != kthread_prio_in) + pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", + kthread_prio, kthread_prio_in); + rcu_scheduler_fully_active = 1; for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); + t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; + if (kthread_prio) { + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); } rcu_spawn_nocb_kthreads(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..119de399eb2f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,7 +27,6 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> -#include <linux/irq_work.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -172,11 +171,6 @@ struct rcu_node { /* queued on this rcu_node structure that */ /* are blocking the current grace period, */ /* there can be no such task. */ - struct completion boost_completion; - /* Used to ensure that the rt_mutex used */ - /* to carry out the boosting is fully */ - /* released with no future boostee accesses */ - /* before that rt_mutex is re-initialized. */ struct rt_mutex boost_mtx; /* Used only for the priority-boosting */ /* side effect, not as a lock. */ @@ -257,9 +251,12 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. 
*/ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ + bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -340,14 +337,10 @@ struct rcu_data { #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; - atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ - atomic_long_t nocb_q_count_lazy; /* (approximate). */ + atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ + atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ - atomic_long_t nocb_follower_count_lazy; /* (approximate). */ - int nocb_p_count; /* # CBs being invoked by kthread */ - int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -356,8 +349,6 @@ struct rcu_data { struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; /* CBs waiting for GP. */ struct rcu_head **nocb_gp_tail; - long nocb_gp_count; - long nocb_gp_count_lazy; bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ struct rcu_data *nocb_next_follower; /* Next follower in wakeup chain. */ @@ -488,10 +479,14 @@ struct rcu_state { /* due to no GP active. */ unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ + unsigned long gp_activity; /* Time of last GP kthread */ + /* activity in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. 
*/ + unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ + /* GP start. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ @@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors; #define for_each_rcu_flavor(rsp) \ list_for_each_entry((rsp), &rcu_struct_flavors, flavors) -/* Return values for rcu_preempt_offline_tasks(). */ - -#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ - /* GP were moved to root. */ -#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ - /* GP were moved to root. */ - /* * RCU implementation internal declarations: */ @@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, - unsigned long flags); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void); #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_NOCB_CPU -/* Sum up queue lengths for tracing. */ +/* Read out queue lengths for tracing. */ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) { - *ql = atomic_long_read(&rdp->nocb_q_count) + - rdp->nocb_p_count + - atomic_long_read(&rdp->nocb_follower_count) + - rdp->nocb_p_count + rdp->nocb_gp_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + - rdp->nocb_p_count_lazy + - atomic_long_read(&rdp->nocb_follower_count_lazy) + - rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; -} +#ifdef CONFIG_RCU_NOCB_CPU + *ql = atomic_long_read(&rdp->nocb_q_count); + *qll = atomic_long_read(&rdp->nocb_q_count_lazy); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ *ql = 0; *qll = 0; -} #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +} #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..0a571e9a0f1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -34,10 +34,6 @@ #include "../locking/rtmutex_common.h" -/* rcuc/rcub kthread realtime priority */ -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -module_param(kthread_prio, int, 0644); - /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. @@ -53,7 +49,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. 
*/ -static char __initdata nocb_buf[NR_CPUS * 5]; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* @@ -103,6 +98,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); /* * Tell them what RCU they are running. @@ -114,25 +111,6 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked @@ -307,15 +285,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, } /* + * Return true if the specified rcu_node structure has tasks that were + * preempted within an RCU read-side critical section. + */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blkd_tasks); +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. 
*/ void rcu_read_unlock_special(struct task_struct *t) { - int empty; - int empty_exp; - int empty_exp_now; + bool empty; + bool empty_exp; + bool empty_norm; + bool empty_exp_now; unsigned long flags; struct list_head *np; #ifdef CONFIG_RCU_BOOST @@ -338,6 +326,7 @@ void rcu_read_unlock_special(struct task_struct *t) special = t->rcu_read_unlock_special; if (special.b.need_qs) { rcu_preempt_qs(); + t->rcu_read_unlock_special.b.need_qs = false; if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; @@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty = !rcu_preempt_has_tasks(rnp); + empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t) #endif /* #ifdef CONFIG_RCU_BOOST */ /* + * If this was the last task on the list, go see if we + * need to propagate ->qsmaskinit bit clearing up the + * rcu_node tree. + */ + if (!empty && !rcu_preempt_has_tasks(rnp)) + rcu_cleanup_dead_rnp(rnp); + + /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, @@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. 
*/ - if (drop_boost_mutex) { + if (drop_boost_mutex) rt_mutex_unlock(&rnp->boost_mtx); - complete(&rnp->boost_completion); - } #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp) static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) + if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } #ifdef CONFIG_HOTPLUG_CPU -/* - * Handle tasklist migration for case in which all CPUs covered by the - * specified rcu_node have gone offline. Move them up to the root - * rcu_node. The reason for not just moving them to the immediate - * parent is to remove the need for rcu_read_unlock_special() to - * make more than two attempts to acquire the target rcu_node's lock. - * Returns true if there were tasks blocking the current RCU grace - * period. - * - * Returns 1 if there was previously a task blocking the current grace - * period on the specified rcu_node structure. - * - * The caller must hold rnp->lock with irqs disabled. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - struct list_head *lp; - struct list_head *lp_root; - int retval = 0; - struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *t; - - if (rnp == rnp_root) { - WARN_ONCE(1, "Last CPU thought to be offlined?"); - return 0; /* Shouldn't happen: at least one CPU online. */ - } - - /* If we are on an internal node, complain bitterly. */ - WARN_ON_ONCE(rnp != rdp->mynode); - - /* - * Move tasks up to root rcu_node. Don't try to get fancy for - * this corner-case operation -- just put this node's tasks - * at the head of the root node's list, and update the root node's - * ->gp_tasks and ->exp_tasks pointers to those of this node's, - * if non-NULL. 
This might result in waiting for more tasks than - * absolutely necessary, but this is a good performance/complexity - * tradeoff. - */ - if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) - retval |= RCU_OFL_TASKS_NORM_GP; - if (rcu_preempted_readers_exp(rnp)) - retval |= RCU_OFL_TASKS_EXP_GP; - lp = &rnp->blkd_tasks; - lp_root = &rnp_root->blkd_tasks; - while (!list_empty(lp)) { - t = list_entry(lp->next, typeof(*t), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - list_del(&t->rcu_node_entry); - t->rcu_blocked_node = rnp_root; - list_add(&t->rcu_node_entry, lp_root); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp_root->gp_tasks = rnp->gp_tasks; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp_root->exp_tasks = rnp->exp_tasks; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp_root->boost_tasks = rnp->boost_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ - } - - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; -#ifdef CONFIG_RCU_BOOST - rnp->boost_tasks = NULL; - /* - * In case root is being boosted and leaf was not. Make sure - * that we boost the tasks blocking the current grace period - * in this case. 
- */ - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks && - rnp_root->boost_tasks != rnp_root->exp_tasks) - rnp_root->boost_tasks = rnp_root->gp_tasks; - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - - return retval; -} - #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* @@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (list_empty(&rnp->blkd_tasks)) { + if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rnp->exp_tasks = rnp->blkd_tasks.next; @@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void) } /* - * Return the number of RCU batches processed thus far for debug & stats. - */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ @@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU -/* Because preemptible RCU does not exist, no quieting of tasks. */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) +/* + * Because there is no preemptible RCU, there can be no readers blocked. 
+ */ +static bool rcu_preempt_has_tasks(struct rcu_node *rnp) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + return false; } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections, and - * such non-existent tasks cannot possibly have been blocking the current - * grace period. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, there is never any need to - * report on tasks preempted in RCU read-side critical sections during - * expedited RCU grace periods. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). @@ -1080,7 +951,7 @@ void exit_rcu(void) static void rcu_initiate_boost_trace(struct rcu_node *rnp) { - if (list_empty(&rnp->blkd_tasks)) + if (!rcu_preempt_has_tasks(rnp)) rnp->n_balk_blkd_tasks++; else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) rnp->n_balk_exp_gp_tasks++; @@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp) struct task_struct *t; struct list_head *tb; - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + if (ACCESS_ONCE(rnp->exp_tasks) == NULL && + ACCESS_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. 
*/ raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - init_completion(&rnp->boost_completion); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ - /* Wait for boostee to be done w/boost_mtx before reinitializing. */ - wait_for_completion(&rnp->boost_completion); - return ACCESS_ONCE(rnp->exp_tasks) != NULL || ACCESS_ONCE(rnp->boost_tasks) != NULL; } @@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { + if (cpumask_weight(cm) == 0) cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } @@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state_p); - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); - } + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } static void rcu_prepare_kthreads(int cpu) @@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * completed since we last checked and there are * callbacks not yet ready to invoke. 
*/ - if (rdp->completed != rnp->completed && + if ((rdp->completed != rnp->completed || + unlikely(ACCESS_ONCE(rdp->gpwrap))) && rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) note_gp_changes(rsp, rdp); @@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, fast_no_hz); } @@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + unsigned long ret; +#ifdef CONFIG_PROVE_RCU struct rcu_head *rhp; +#endif /* #ifdef CONFIG_PROVE_RCU */ - /* No-CBs CPUs might have callbacks on any of three lists. */ + /* + * Check count of all no-CBs callbacks awaiting invocation. + * There needs to be a barrier before this function is called, + * but associated with a prior determination that no more + * callbacks would be posted. In the worst case, the first + * barrier in _rcu_barrier() suffices (but the caller cannot + * necessarily rely on this, not a substitute for the caller + * getting the concurrency design right!). There must also be + * a barrier between the following load and posting of a callback + * (if a callback is in fact needed). This is associated with an + * atomic_inc() in the caller. 
+ */ + ret = atomic_long_read(&rdp->nocb_q_count); + +#ifdef CONFIG_PROVE_RCU rhp = ACCESS_ONCE(rdp->nocb_head); if (!rhp) rhp = ACCESS_ONCE(rdp->nocb_gp_head); @@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) cpu, rhp->func); WARN_ON_ONCE(1); } +#endif /* #ifdef CONFIG_PROVE_RCU */ - return !!rhp; + return !!ret; } /* @@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct task_struct *t; /* Enqueue the callback on the nocb list and update counts. */ + atomic_long_add(rhcount, &rdp->nocb_q_count); + /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ old_rhpp = xchg(&rdp->nocb_tail, rhtp); ACCESS_ONCE(*old_rhpp) = rhp; - atomic_long_add(rhcount, &rdp->nocb_q_count); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ @@ -2288,9 +2169,6 @@ wait_again: /* Move callbacks to wait-for-GP list, which is empty. */ ACCESS_ONCE(rdp->nocb_head) = NULL; rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); - rdp->nocb_gp_count_lazy = - atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); gotcbs = true; } @@ -2338,9 +2216,6 @@ wait_again: /* Append callbacks to follower's "done" list. */ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); *tail = rdp->nocb_gp_head; - atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); - atomic_long_add(rdp->nocb_gp_count_lazy, - &rdp->nocb_follower_count_lazy); smp_mb__after_atomic(); /* Store *tail before wakeup. 
*/ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* @@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); ACCESS_ONCE(rdp->nocb_follower_head) = NULL; tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); - c = atomic_long_xchg(&rdp->nocb_follower_count, 0); - cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); - rdp->nocb_p_count += c; - rdp->nocb_p_count_lazy += cl; /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + trace_rcu_batch_start(rdp->rsp->name, + atomic_long_read(&rdp->nocb_q_count_lazy), + atomic_long_read(&rdp->nocb_q_count), -1); c = cl = 0; while (list) { next = list->next; @@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg) list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) = - rdp->nocb_p_count_lazy - cl; + smp_mb__before_atomic(); /* _add after CB invocation. 
*/ + atomic_long_add(-c, &rdp->nocb_q_count); + atomic_long_add(-cl, &rdp->nocb_q_count_lazy); rdp->n_nocbs_invoked += c; } return 0; @@ -2513,8 +2386,8 @@ void __init rcu_init_nohz(void) cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } - cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..fbb6240509ea 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' 
: ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, diff --git a/kernel/resource.c b/kernel/resource.c index 0bcebffc4e77..19f2357dfda3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -22,6 +22,7 @@ #include <linux/device.h> #include <linux/pfn.h> #include <linux/mm.h> +#include <linux/resource_ext.h> #include <asm/io.h> @@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr) return err; } +struct resource_entry *resource_list_create_entry(struct resource *res, + size_t extra_size) +{ + struct resource_entry *entry; + + entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); + if (entry) { + INIT_LIST_HEAD(&entry->node); + entry->res = res ? res : &entry->__res; + } + + return entry; +} +EXPORT_SYMBOL(resource_list_create_entry); + +void resource_list_free(struct list_head *head) +{ + struct resource_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, node) + resource_list_destroy_entry(entry); +} +EXPORT_SYMBOL(resource_list_free); + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ab32b7b0db5c..46be87024875 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,5 +1,5 @@ ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = -pg +CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 8a2e230fb86a..eae160dd669d 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void) * so we don't have to move tasks around upon policy change, * or flail around trying to allocate bandwidth on the fly. 
* A bandwidth exception in __sched_setscheduler() allows - * the policy change to proceed. Thereafter, task_group() - * returns &root_task_group, so zero bandwidth is required. + * the policy change to proceed. */ free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; @@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) if (tg != &root_task_group) return false; - if (p->sched_class != &fair_sched_class) - return false; - /* * We can only assume the task group can't go away on us if * autogroup_move_group() can see us on ->thread_group list. diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c27e4f8f4879..c0a205101c23 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -420,3 +420,16 @@ u64 local_clock(void) EXPORT_SYMBOL_GPL(cpu_clock); EXPORT_SYMBOL_GPL(local_clock); + +/* + * Running clock - returns the time that has elapsed while a guest has been + * running. + * On a guest this value should be local_clock minus the time the guest was + * suspended by the hypervisor (for any reason). + * On bare metal this function should return the same as local_clock. + * Architectures and sub-architectures can override this. + */ +u64 __weak running_clock(void) +{ + return local_clock(); +} diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 607f852b4d04..8d0f35debf35 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; + /* + * Since x->done will need to be locked only + * in the non-blocking case, we check x->done + * first without taking the lock so we can + * return early in the blocking case. 
+ */ + if (!READ_ONCE(x->done)) + return 0; + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; @@ -288,13 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { - unsigned long flags; - int ret = 1; + if (!READ_ONCE(x->done)) + return false; - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; + /* + * If ->done, we need to wait for complete() to release ->wait.lock + * otherwise we can end up freeing the completion before complete() + * is done referencing it. + * + * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders + * the loads of ->done and ->wait.lock such that we cannot observe + * the lock before complete() acquires it while observing the ->done + * after it's acquired the lock. + */ + smp_rmb(); + spin_unlock_wait(&x->wait.lock); + return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..2f7937ee9e3a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq) { s64 delta; - if (rq->skip_clock_update > 0) + lockdep_assert_held(&rq->lock); + + if (rq->clock_skip_update & RQCF_ACT_SKIP) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; @@ -304,65 +306,8 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) - return rq; - raw_spin_unlock(&rq->lock); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} - -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 
- */ -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) - return rq; - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} - -static void __task_rq_unlock(struct rq *rq) - __releases(rq->lock) -{ - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) - __releases(rq->lock) - __releases(p->pi_lock) -{ - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -} +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; /* * this_rq_lock - lock this runqueue and disable interrupts. @@ -490,6 +435,11 @@ static __init void init_hrtick(void) */ void hrtick_start(struct rq *rq, u64 delay) { + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + delay = max_t(u64, delay, 10000LL); __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); } @@ -743,6 +693,23 @@ static inline bool got_nohz_idle_kick(void) bool sched_can_stop_tick(void) { /* + * FIFO realtime policy runs the highest priority task. Other runnable + * tasks are of a lower priority. The scheduler tick does nothing. + */ + if (current->policy == SCHED_FIFO) + return true; + + /* + * Round-robin realtime tasks time slice with other tasks at the same + * realtime priority. Is this task the only one at this priority? + */ + if (current->policy == SCHED_RR) { + struct sched_rt_entity *rt_se = &current->rt; + + return rt_se->run_list.prev == rt_se->run_list.next; + } + + /* * More than one running task need preemption. 
* nr_running update is assumed to be visible * after IPI is sent from wakers. @@ -1046,7 +1013,14 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq->skip_clock_update = 1; + rq_clock_skip_update(rq, true); +} + +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); } #ifdef CONFIG_SMP @@ -1079,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); @@ -1814,6 +1796,10 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } /* @@ -1832,6 +1818,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SMP + p->se.avg.decay_count = 0; +#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -1839,7 +1828,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif RB_CLEAR_NODE(&p->dl.rb_node); - hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + init_dl_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2049,6 +2038,9 @@ static inline int 
dl_bw_cpus(int i) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. + * + * XXX we should delay bw change until the task's 0-lag point, see + * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -2748,6 +2740,10 @@ again: * - explicit schedule() call * - return from syscall or exception to user-space * - return from interrupt-handler to user-space + * + * WARNING: all callers must re-check need_resched() afterward and reschedule + * accordingly in case an event triggered the need for rescheduling (such as + * an interrupt waking up a task) while preemption was disabled in __schedule(). */ static void __sched __schedule(void) { @@ -2756,7 +2752,6 @@ static void __sched __schedule(void) struct rq *rq; int cpu; -need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -2776,6 +2771,8 @@ need_resched: smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -2800,13 +2797,13 @@ need_resched: switch_count = &prev->nvcsw; } - if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev)) update_rq_clock(rq); next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->skip_clock_update = 0; + rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; @@ -2821,8 +2818,6 @@ need_resched: post_schedule(rq); sched_preempt_enable_no_resched(); - if (need_resched()) - goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) @@ -2842,7 +2837,9 @@ asmlinkage __visible void __sched schedule(void) struct task_struct *tsk = current; sched_submit_work(tsk); - __schedule(); + do { + __schedule(); + } while (need_resched()); } 
EXPORT_SYMBOL(schedule); @@ -2877,6 +2874,21 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +static void __sched notrace preempt_schedule_common(void) +{ + do { + __preempt_count_add(PREEMPT_ACTIVE); + __schedule(); + __preempt_count_sub(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (need_resched()); +} + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -2892,17 +2904,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; - do { - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (need_resched()); + preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); @@ -3067,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } else { if (dl_prio(oldprio)) p->dl.dl_boosted = 0; + if (rt_prio(oldprio)) + p->rt.timeout = 0; p->sched_class = &fair_sched_class; } @@ -3251,15 +3255,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; - init_dl_task_timer(dl_se); dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - dl_se->dl_throttled = 0; - dl_se->dl_new = 1; - dl_se->dl_yielded = 0; + + /* + * Changing the parameters of a task is 'tricky' and we're not doing + * the correct thing -- also see task_dead_dl() and switched_from_dl(). + * + * What we SHOULD do is delay the bandwidth release until the 0-lag + * point. 
This would include retaining the task_struct until that time + * and change dl_overflow() to not immediately decrement the current + * amount. + * + * Instead we retain the current runtime/deadline and let the new + * parameters take effect after the current reservation period lapses. + * This is safe (albeit pessimistic) because the 0-lag point is always + * before the current scheduling deadline. + * + * We can still have temporary overloads because we do not delay the + * change in bandwidth until that time; so admission control is + * not on the safe side. It does however guarantee tasks will never + * consume more than promised. + */ } /* @@ -3382,6 +3402,20 @@ static bool check_same_owner(struct task_struct *p) return match; } +static bool dl_param_changed(struct task_struct *p, + const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; + + return false; +} + static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) @@ -3510,7 +3544,7 @@ recheck: goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; - if (dl_policy(policy)) + if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; p->sched_reset_on_fork = reset_on_fork; @@ -4202,17 +4236,10 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static void __cond_resched(void) -{ - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); -} - int __sched _cond_resched(void) { if (should_resched()) { - __cond_resched(); + preempt_schedule_common(); return 1; } return 0; @@ -4237,7 +4264,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) - __cond_resched(); + preempt_schedule_common(); else cpu_relax(); ret = 1; @@ -4253,7 
+4280,7 @@ int __sched __cond_resched_softirq(void) if (should_resched()) { local_bh_enable(); - __cond_resched(); + preempt_schedule_common(); local_bh_disable(); return 1; } @@ -4368,36 +4395,29 @@ EXPORT_SYMBOL_GPL(yield_to); * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ -void __sched io_schedule(void) -{ - struct rq *rq = raw_rq(); - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; - schedule(); - current->in_iowait = 0; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); -} -EXPORT_SYMBOL(io_schedule); - long __sched io_schedule_timeout(long timeout) { - struct rq *rq = raw_rq(); + int old_iowait = current->in_iowait; + struct rq *rq; long ret; + current->in_iowait = 1; + if (old_iowait) + blk_schedule_flush_plug(current); + else + blk_flush_plug(current); + delayacct_blkio_start(); + rq = raw_rq(); atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; ret = schedule_timeout(timeout); - current->in_iowait = 0; + current->in_iowait = old_iowait; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); + return ret; } +EXPORT_SYMBOL(io_schedule_timeout); /** * sys_sched_get_priority_max - return maximum RT priority. @@ -4508,9 +4528,10 @@ void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned state; + unsigned long state = p->state; - state = p->state ? __ffs(p->state) + 1 : 0; + if (state) + state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); #if BITS_PER_LONG == 32 @@ -4642,6 +4663,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, struct dl_bw *cur_dl_b; unsigned long flags; + if (!cpumask_weight(cur)) + return ret; + rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); trial_cpus = cpumask_weight(trial); @@ -4740,7 +4764,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); @@ -5331,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned long flags; - long cpu = (long)hcpu; - struct dl_bw *dl_b; - switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active(cpu, false); - - /* explicitly allow suspend */ - if (!(action & CPU_TASKS_FROZEN)) { - bool overflow; - int cpus; - - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (overflow) - return notifier_from_errno(-EBUSY); - } + set_cpu_active((long)hcpu, false); return NOTIFY_OK; + default: + return NOTIFY_DONE; } - - return NOTIFY_DONE; } static int __init migration_init(void) @@ -5408,9 +5409,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; - char str[256]; - cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -5423,7 +5422,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, 
int level, return -1; } - printk(KERN_CONT "span %s level %s\n", str, sd->name); + printk(KERN_CONT "span %*pbl level %s\n", + cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " @@ -5442,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - /* - * Even though we initialize ->capacity to something semi-sane, - * we leave capacity_orig unset. This allows us to detect if - * domain iteration is still funny without causing /0 traps. - */ - if (!group->sgc->capacity_orig) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); - break; - } - if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); @@ -5468,9 +5457,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - - printk(KERN_CONT " %s", str); + printk(KERN_CONT " %*pbl", + cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { printk(KERN_CONT " (cpu_capacity = %d)", group->sgc->capacity); @@ -5826,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { @@ -5937,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. 
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -6248,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) */ if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ @@ -7013,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, */ case CPU_ONLINE: - case CPU_DOWN_FAILED: cpuset_update_active_cpus(true); break; default: @@ -7025,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action) { + unsigned long flags; + long cpu = (long)hcpu; + struct dl_bw *dl_b; + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + bool overflow; + int cpus; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + + if (overflow) + return notifier_from_errno(-EBUSY); + } cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: @@ -7171,8 +7177,8 @@ void __init sched_init(void) rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); - init_rt_rq(&rq->rt, rq); - init_dl_rq(&rq->dl, rq); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -7212,7 +7218,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 
rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -7250,6 +7256,11 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again @@ -7259,11 +7270,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ @@ -7292,13 +7298,12 @@ void __might_sleep(const char *file, int line, int preempt_offset) * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - if (WARN_ONCE(current->state != TASK_RUNNING, + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " "state=%lx set at [<%p>] %pS\n", current->state, (void *)current->task_state_change, - (void *)current->task_state_change)) - __set_current_state(TASK_RUNNING); + (void *)current->task_state_change); ___might_sleep(file, line, preempt_offset); } @@ -7325,6 +7330,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) in_atomic(), irqs_disabled(), current->pid, current->comm); + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); @@ -7588,6 +7596,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; + /* + * Autogroups do not have RT tasks; see autogroup_create(). 
+ */ + if (task_group_is_autogroup(tg)) + return 0; + for_each_process_thread(g, p) { if (rt_task(p) && task_group(p) == tg) return 1; @@ -7680,6 +7694,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg, { int i, err = 0; + /* + * Disallowing the root group RT runtime is BAD, it would disallow the + * kernel creating (and or operating) RT threads. + */ + if (tg == &root_task_group && rt_runtime == 0) + return -EINVAL; + + /* No period doesn't make any sense. */ + if (rt_period == 0) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); err = __rt_schedulable(tg, rt_period, rt_runtime); @@ -7736,9 +7761,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) rt_period = (u64)rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; - if (rt_period == 0) - return -EINVAL; - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -7795,7 +7817,7 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ -static int sched_dl_global_constraints(void) +static int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); @@ -7896,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, if (ret) goto undo; - ret = sched_rt_global_constraints(); + ret = sched_dl_global_validate(); if (ret) goto undo; - ret = sched_dl_global_constraints(); + ret = sched_rt_global_constraints(); if (ret) goto undo; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 539ca3ce071b..c6acb07466bb 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { + if (later_mask && + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { best_cpu = cpumask_any(later_mask); goto out; } else 
if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && @@ -186,6 +187,26 @@ out: } /* + * cpudl_set_freecpu - Set the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_set_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_set_cpu(cpu, cp->free_cpus); +} + +/* + * cpudl_clear_freecpu - Clear the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_clear_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_clear_cpu(cpu, cp->free_cpus); +} + +/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ @@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp) if (!cp->elements) return -ENOMEM; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { kfree(cp->elements); return -ENOMEM; } @@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp) for_each_possible_cpu(i) cp->elements[i].idx = IDX_INVALID; - cpumask_setall(cp->free_cpus); - return 0; } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 020039bd1326..1a0a6ef2fbe1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); #endif /* CONFIG_SMP */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b52092f2636d..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) dl_b->total_bw = 0; } -void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->rb_root = RB_ROOT; @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) 
rq->post_schedule = has_pushable_dl_tasks(rq); } +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); + +static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +{ + struct rq *later_rq = NULL; + bool fallback = false; + + later_rq = find_lock_later_rq(p, rq); + + if (!later_rq) { + int cpu; + + /* + * If we cannot preempt any rq, fall back to pick any + * online cpu. + */ + fallback = true; + cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); + if (cpu >= nr_cpu_ids) { + /* + * Fail to find any suitable cpu. + * The task will never come back! + */ + BUG_ON(dl_bandwidth_enabled()); + + /* + * If admission control is disabled we + * try a little harder to let the task + * run. + */ + cpu = cpumask_any(cpu_active_mask); + } + later_rq = cpu_rq(cpu); + double_lock_balance(rq, later_rq); + } + + deactivate_task(rq, p, 0); + set_task_cpu(p, later_rq->cpu); + activate_task(later_rq, p, ENQUEUE_REPLENISH); + + if (!fallback) + resched_curr(later_rq); + + double_unlock_balance(rq, later_rq); +} + #else static inline @@ -350,6 +396,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } + + if (dl_se->dl_yielded) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; } /* @@ -506,16 +557,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); + unsigned long flags; struct rq *rq; -again: - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (rq != task_rq(p)) { - /* Task was moved, retrying. 
*/ - raw_spin_unlock(&rq->lock); - goto again; - } + rq = task_rq_lock(p, &flags); /* * We need to take care of several possible races here: @@ -536,25 +581,52 @@ again: sched_clock_tick(); update_rq_clock(rq); - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - if (task_on_rq_queued(p)) { - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); + #ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) - push_dl_task(rq); + /* + * If we find that the rq the task was on is no longer + * available, we need to select a new rq. + */ + if (unlikely(!rq->online)) { + dl_task_offline_migration(rq, p); + goto unlock; + } #endif + + /* + * If the throttle happened during sched-out; like: + * + * schedule() + * deactivate_task() + * dequeue_task_dl() + * update_curr_dl() + * start_dl_timer() + * __dequeue_task_dl() + * prev->on_rq = 0; + * + * We can be both throttled and !queued. Replenish the counter + * but do not enqueue -- wait for our wakeup to do that. + */ + if (!task_on_rq_queued(p)) { + replenish_dl_entity(dl_se, dl_se); + goto unlock; } + + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); +#endif unlock: - raw_spin_unlock(&rq->lock); + task_rq_unlock(rq, p, &flags); return HRTIMER_NORESTART; } @@ -613,10 +685,9 @@ static void update_curr_dl(struct rq *rq) dl_se->runtime -= dl_se->dl_yielded ? 
0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { + dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) - dl_se->dl_throttled = 1; - else + if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -853,7 +924,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. */ - if (p->dl.dl_throttled) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) return; enqueue_dl_entity(&p->dl, pi_se, flags); @@ -898,7 +969,14 @@ static void yield_task_dl(struct rq *rq) rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } + update_rq_clock(rq); update_curr_dl(rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq, true); } #ifdef CONFIG_SMP @@ -1073,7 +1151,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + /* + * Even when we have runtime, update_curr_dl() might have resulted in us + * not being the leftmost task anymore. In that case NEED_RESCHED will + * be set and schedule() will start a new hrtick for the next task. + */ + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } @@ -1094,6 +1178,7 @@ static void task_dead_dl(struct task_struct *p) * Since we are TASK_DEAD we won't slip out of the domain! */ raw_spin_lock_irq(&dl_b->lock); + /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); @@ -1165,9 +1250,6 @@ static int find_later_rq(struct task_struct *task) * We have to consider system topology and task affinity * first, then we can look for a suitable cpu. 
*/ - cpumask_copy(later_mask, task_rq(task)->rd->span); - cpumask_and(later_mask, later_mask, cpu_active_mask); - cpumask_and(later_mask, later_mask, &task->cpus_allowed); best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) @@ -1562,6 +1644,7 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); + cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); } @@ -1573,6 +1656,7 @@ static void rq_offline_dl(struct rq *rq) dl_clear_overload(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } void init_sched_dl_class(void) @@ -1614,8 +1698,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p) static void switched_from_dl(struct rq *rq, struct task_struct *p) { + /* XXX we should retain the bw until 0-lag */ cancel_dl_timer(rq, p); - __dl_clear_params(p); /* @@ -1638,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) { int check_resched = 1; - /* - * If p is throttled, don't consider the possibility - * of preempting rq->curr, the check will be done right - * after its runtime will get replenished. 
- */ - if (unlikely(p->dl.dl_throttled)) - return; - if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 92cc52001e74..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group if (!se) { struct sched_avg *avg = &cpu_rq(cpu)->avg; P(avg->runnable_avg_sum); - P(avg->runnable_avg_period); + P(avg->avg_period); return; } @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); - P(se->avg.runnable_avg_period); + P(se->avg.running_avg_sum); + P(se->avg.avg_period); P(se->avg.load_avg_contrib); + P(se->avg.utilization_avg_contrib); P(se->avg.decay_count); #endif #undef PN @@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", + cfs_rq->utilization_load_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); @@ -305,6 +309,7 @@ do { \ PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); + PN(clock_task); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); @@ -635,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); - P(se.avg.runnable_avg_period); + P(se.avg.running_avg_sum); + P(se.avg.avg_period); P(se.avg.load_avg_contrib); + P(se.avg.utilization_avg_contrib); P(se.avg.decay_count); #endif P(policy); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40667cbf371b..ffeaa4105e48 100644 --- a/kernel/sched/fair.c +++ 
b/kernel/sched/fair.c @@ -670,17 +670,18 @@ static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); +static inline void __update_task_entity_utilization(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) { u32 slice; - p->se.avg.decay_count = 0; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = slice; - p->se.avg.runnable_avg_period = slice; + p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; + p->se.avg.avg_period = slice; __update_task_entity_contrib(&p->se); + __update_task_entity_utilization(&p->se); } #else void init_task_runnable_average(struct task_struct *p) @@ -1197,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { - long imb, old_imb; - long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; + long orig_src_load; + long load_a, load_b; + long moved_load; + long imb; /* * The load is corrected for the CPU capacity available on each node. @@ -1212,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + load_a = dst_load; + load_b = src_load; + if (load_a < load_b) + swap(load_a, load_b); /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; + imb = load_a * src_capacity * 100 - + load_b * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. 
+ * Allow a move that brings us closer to a balanced situation, + * without moving things past the point of balance. */ orig_src_load = env->src_stats.load; - orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + /* + * In a task swap, there will be one load moving from src to dst, + * and another moving back. This is the net sum of both moves. + * A simple task move will always have a positive value. + * Allow the move if it brings the system closer to a balanced + * situation, without crossing over the balance point. + */ + moved_load = orig_src_load - src_load; - /* Would this change make things worse? */ - return (imb > old_imb); + if (moved_load > 0) + /* Moving src -> dst. Did we overshoot balance? */ + return src_load * dst_capacity < dst_load * src_capacity; + else + /* Moving dst -> src. Did we overshoot balance? */ + return dst_load * src_capacity < src_load * dst_capacity; } /* @@ -1610,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. 
In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -1674,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) *period = now - p->last_task_numa_placement; } else { delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.runnable_avg_period; + *period = p->se.avg.avg_period; } p->last_sum_exec_runtime = runtime; @@ -1730,7 +1744,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) nodes = node_online_map; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; - nodemask_t max_group; + nodemask_t max_group = NODE_MASK_NONE; int a, b; /* Are there nodes at this distance from each other? */ @@ -1764,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) } } /* Next round, evaluate the nodes within max_group. */ + if (!max_faults) + break; nodes = max_group; } return nid; @@ -2081,6 +2097,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; @@ -2162,8 +2180,10 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(vma)) + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma)) { continue; + } /* * Shared library pages mapped by multiple processes are not @@ -2498,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... 
[re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, struct sched_avg *sa, - int runnable) + int runnable, + int running) { u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; + unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_runnable_update; /* @@ -2526,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->last_runnable_update = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->runnable_avg_period % 1024; + delta_w = sa->avg_period % 1024; if (delta + delta_w >= 1024) { /* period roll-over */ decayed = 1; @@ -2539,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, delta_w = 1024 - delta_w; if (runnable) sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; + if (running) + sa->running_avg_sum += delta_w * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += delta_w; delta -= delta_w; @@ -2549,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); - sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + sa->running_avg_sum = decay_load(sa->running_avg_sum, + periods + 1); + sa->avg_period = decay_load(sa->avg_period, periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); if (runnable) sa->runnable_avg_sum += runnable_contrib; - sa->runnable_avg_period += runnable_contrib; + if (running) + sa->running_avg_sum += runnable_contrib * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ if (runnable) sa->runnable_avg_sum += delta; - sa->runnable_avg_period += delta; + if (running) + sa->running_avg_sum += delta * scale_freq + >> 
SCHED_CAPACITY_SHIFT; + sa->avg_period += delta; return decayed; } @@ -2574,11 +2607,13 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; + se->avg.decay_count = 0; if (!decays) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.decay_count = 0; + se->avg.utilization_avg_contrib = + decay_load(se->avg.utilization_avg_contrib, decays); return decays; } @@ -2614,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->runnable_avg_period + 1); + sa->avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { @@ -2667,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, + runnable, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2685,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.runnable_avg_period + 1); + contrib /= (se->avg.avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); } @@ -2704,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } + +static inline void __update_task_entity_utilization(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.running_avg_sum * 
scale_load_down(SCHED_LOAD_SCALE); + contrib /= (se->avg.avg_period + 1); + se->avg.utilization_avg_contrib = scale_load(contrib); +} + +static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.utilization_avg_contrib; + + if (entity_is_task(se)) + __update_task_entity_utilization(se); + else + se->avg.utilization_avg_contrib = + group_cfs_rq(se)->utilization_load_avg; + + return se->avg.utilization_avg_contrib - old_contrib; +} + static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, long load_contrib) { @@ -2720,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, utilization_delta; + int cpu = cpu_of(rq_of(cfs_rq)); u64 now; /* @@ -2732,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, + cfs_rq->curr == se)) return; contrib_delta = __update_entity_load_avg_contrib(se); + utilization_delta = __update_entity_utilization_avg_contrib(se); if (!update_cfs_rq) return; - if (se->on_rq) + if (se->on_rq) { cfs_rq->runnable_load_avg += contrib_delta; - else + cfs_rq->utilization_load_avg += utilization_delta; + } else { subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + } } /* @@ -2818,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; /* we force update consideration on load-balancer moves */ update_cfs_rq_blocked_load(cfs_rq, !wakeup); } @@ -2836,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= 
se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; if (sleep) { cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); @@ -3173,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -4299,6 +4367,11 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +static unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -4712,6 +4785,33 @@ next: done: return target; } +/* + * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the usage with the capacity of the CPU that is available for CFS + * task (ie cpu_capacity). + * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * CPU. It represents the amount of utilization of a CPU in the range + * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full + * capacity of the CPU because it's about the running time on this CPU. + * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in avg_period and running_load_avg or just + * after migrating tasks until the average stabilizes with the new running + * time. So we need to check that the usage stays into the range + * [0..cpu_capacity_orig] and cap if necessary. 
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage + * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + */ +static int get_cpu_usage(int cpu) +{ + unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long capacity = capacity_orig_of(cpu); + + if (usage >= SCHED_LOAD_SCALE) + return capacity; + + return (usage * capacity) >> SCHED_LOAD_SHIFT; +} /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -5157,7 +5257,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq->skip_clock_update = 1; + rq_clock_skip_update(rq, true); } set_skip_buddy(se); @@ -5838,12 +5938,12 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; + unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_has_free_capacity; + int group_no_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5914,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_capacity(sd, cpu); -} - static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) @@ -5940,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, 
available, age_stamp, avg; + u64 total, used, age_stamp, avg; s64 delta; /* @@ -5949,26 +6039,19 @@ static unsigned long scale_rt_capacity(int cpu) */ age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); + delta = __rq_clock_broken(rq) - age_stamp; - delta = rq_clock(rq) - age_stamp; if (unlikely(delta < 0)) delta = 0; total = sched_avg_period() + delta; - if (unlikely(total < avg)) { - /* Ensures that capacity won't end up being negative */ - available = 0; - } else { - available = total - avg; - } + used = div_u64(avg, total); - if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) - total = SCHED_CAPACITY_SCALE; + if (likely(used < SCHED_CAPACITY_SCALE)) + return SCHED_CAPACITY_SCALE - used; - total >>= SCHED_CAPACITY_SHIFT; - - return div_u64(available, total); + return 1; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) @@ -5983,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity >>= SCHED_CAPACITY_SHIFT; - sdg->sgc->capacity_orig = capacity; - - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_freq_capacity(sd, cpu); - else - capacity *= default_scale_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -6006,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, capacity_orig; + unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6018,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) return; } - capacity_orig = capacity = 0; + capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6038,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). 
* - * This avoids capacity/capacity_orig from being 0 and + * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. - * - * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += capacity_of(cpu); capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; - capacity_orig += sgc->capacity_orig; capacity += sgc->capacity; } } else { @@ -6061,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity_orig += group->sgc->capacity_orig; capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgc->capacity_orig = capacity_orig; sdg->sgc->capacity = capacity; } /* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true if the capacity is reduced */ static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { - /* - * Only siblings can have significantly less than SCHED_CAPACITY_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUCAPACITY)) - return 0; - - /* - * If ~90% of the cpu_capacity is still there, we're good. - */ - if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) - return 1; - - return 0; + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); } /* @@ -6131,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity factor. 
- * - * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit unit capacity with that. + * group_has_capacity returns true if the group has spare capacity that could + * be used by some tasks. + * We consider that a group has spare capacity if the number of tasks is + * smaller than the number of CPUs or if the usage is lower than the available + * capacity for CFS tasks. + * For the latter, we use a threshold to stabilize the state, to take into + * account the variance of the tasks' load and to return true if the available + * capacity is meaningful for the load balancer. + * As an example, an available capacity of 1% can appear but it doesn't make + * any benefit for the load balance. */ -static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +static inline bool +group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - unsigned int capacity_factor, smt, cpus; - unsigned int capacity, capacity_orig; + if (sgs->sum_nr_running < sgs->group_weight) + return true; - capacity = group->sgc->capacity; - capacity_orig = group->sgc->capacity_orig; - cpus = group->group_weight; + if ((sgs->group_capacity * 100) > + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); - capacity_factor = cpus / smt; /* cores */ + return false; +} - capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); +/* + * group_is_overloaded returns true if the group has more tasks than it can + * handle. 
+ * group_is_overloaded is not equal to !group_has_capacity because a group + * with the exact right number of tasks, has no more spare capacity but is not + * overloaded so both group_has_capacity and group_is_overloaded return + * false. + */ +static inline bool +group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running <= sgs->group_weight) + return false; - return capacity_factor; + if ((sgs->group_capacity * 100) < + (sgs->group_usage * env->sd->imbalance_pct)) + return true; + + return false; } -static enum group_type -group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +static enum group_type group_classify(struct lb_env *env, + struct sched_group *group, + struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_no_capacity) return group_overloaded; if (sg_imbalanced(group)) @@ -6199,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; + sgs->group_usage += get_cpu_usage(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) @@ -6221,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - sgs->group_capacity_factor = sg_capacity_factor(env, group); - sgs->group_type = group_classify(group, sgs); - if (sgs->group_capacity_factor > sgs->sum_nr_running) - sgs->group_has_free_capacity = 1; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(env, group, sgs); } /** @@ -6347,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity factor to one so that we'll try + * first, lower the sg capacity so that we'll try * and move all the excess tasks away. 
We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity_factor. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). + * these excess tasks. The extra check prevents the case where + * you always pull from the heaviest group when it is already + * under-utilized (possible with a large weight task outweighs + * the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) { - sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); - sgs->group_type = group_classify(sg, sgs); + group_has_capacity(env, &sds->local_stat) && + (sgs->sum_nr_running > 1)) { + sgs->group_no_capacity = 1; + sgs->group_type = group_overloaded; } if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -6538,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity_factor); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); - load_above_capacity /= busiest->group_capacity; + load_above_capacity = busiest->sum_nr_running * + SCHED_LOAD_SCALE; + if (load_above_capacity > busiest->group_capacity) + load_above_capacity -= busiest->group_capacity; + else + load_above_capacity = ~0UL; } /* @@ -6605,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; + /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; @@ -6625,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* 
SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && - !busiest->group_has_free_capacity) + if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) goto force_balance; /* @@ -6685,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long capacity, capacity_factor, wl; + unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6714,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6724,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ - if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) + + if (rq->nr_running == 1 && wl > env->imbalance && + !check_cpu_capacity(rq, env->sd)) continue; /* @@ -6772,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) return 1; } + /* + * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. + * It's worth migrating the task if the src_cpu's capacity is reduced + * because of other sched_class or IRQs if more capacity stays + * available on dst_cpu. 
+ */ + if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->cfs.h_nr_running == 1)) { + if ((check_cpu_capacity(env->src_rq, sd)) && + (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -6871,6 +6961,9 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + ld_moved = 0; if (busiest->nr_running > 1) { /* @@ -6880,8 +6973,6 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -7581,22 +7672,25 @@ end: /* * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. + * of an idle cpu in the system. * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's capacity. + * - This rq has at least one CFS task and the capacity of the CPU is + * significantly reduced because of RT tasks or IRQs. + * - At parent of LLC scheduler domain level, this cpu's scheduler group has + * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; + bool kick = false; if (unlikely(rq->idle_balance)) - return 0; + return false; /* * We may be recently in ticked or tickless idle mode. At the first @@ -7610,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) * balancing. 
*/ if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; + return false; if (time_before(now, nohz.next_balance)) - return 0; + return false; if (rq->nr_running >= 2) - goto need_kick; + return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (nr_busy > 1) - goto need_kick_unlock; + if (nr_busy > 1) { + kick = true; + goto unlock; + } + } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(rq->sd); + if (sd) { + if ((rq->cfs.h_nr_running >= 1) && + check_cpu_capacity(rq, sd)) { + kick = true; + goto unlock; + } + } + sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; - - rcu_read_unlock(); - return 0; + sched_domain_span(sd)) < cpu)) { + kick = true; + goto unlock; + } -need_kick_unlock: +unlock: rcu_read_unlock(); -need_kick: - return 1; + return kick; } #else static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } @@ -7657,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_rq, idle); - /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are - * stopped. + * stopped. Do nohz_idle_balance *before* rebalance_domains to + * give the idle cpus a chance to load balance. Else we may + * load balance only within the local sched_domain hierarchy + * and abort nohz_idle_balance altogether if we pull some load. 
*/ nohz_idle_balance(this_rq, idle); + rebalance_domains(this_rq, idle); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) */ SCHED_FEAT(TTWU_QUEUE, true) +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. + */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c47fce75e666..4d207d2abcbd 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ #include <linux/tick.h> #include <linux/mm.h> #include <linux/stackprotector.h> +#include <linux/suspend.h> #include <asm/tlb.h> @@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!tif_need_resched()) + while (!tif_need_resched() && + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); @@ -80,6 +82,7 @@ static void cpuidle_idle_call(void) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; unsigned int broadcast; + bool reflect; /* * Check if the idle task must be rescheduled. 
If it is the @@ -103,25 +106,37 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + if (cpuidle_not_available(drv, dev)) + goto use_default; + /* - * Ask the cpuidle framework to choose a convenient idle state. - * Fall back to the default arch idle method on errors. + * Suspend-to-idle ("freeze") is a system state in which all user space + * has been frozen, all I/O devices have been suspended and the only + * activity happens here and in interrupts (if any). In that case bypass + * the cpuidle governor and go straight for the deepest idle state + * available. Possibly also suspend the local tick and the entire + * timekeeping to prevent timer interrupts from kicking us out of idle + * until a proper wakeup interrupt happens. */ - next_state = cpuidle_select(drv, dev); - if (next_state < 0) { -use_default: - /* - * We can't use the cpuidle framework, let's use the default - * idle routine. - */ - if (current_clr_polling_and_test()) + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state >= 0) { local_irq_enable(); - else - arch_cpu_idle(); + goto exit_idle; + } - goto exit_idle; + reflect = false; + next_state = cpuidle_find_deepest_state(drv, dev); + } else { + reflect = true; + /* + * Ask the cpuidle framework to choose a convenient idle state. + */ + next_state = cpuidle_select(drv, dev); } - + /* Fall back to the default arch idle method on errors. */ + if (next_state < 0) + goto use_default; /* * The idle task must be scheduled, it is pointless to @@ -143,8 +158,7 @@ use_default: * is used from another cpu as a broadcast timer, this call may * fail if it is not available */ - if (broadcast && - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + if (broadcast && tick_broadcast_enter()) goto use_default; /* Take note of the planned idle state. 
*/ @@ -161,12 +175,13 @@ use_default: idle_set_state(this_rq(), NULL); if (broadcast) - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + tick_broadcast_exit(); /* * Give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); + if (reflect) + cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); @@ -179,6 +194,19 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); + return; + +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. + */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; } /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ee15f5a0d1c1..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include <linux/slab.h> +#include <linux/irq_work.h> int sched_rr_timeslice = RR_TIMESLICE; @@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +#ifdef CONFIG_SMP +static void push_irq_work_func(struct irq_work *work); +#endif + +void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; int i; @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); + +#ifdef HAVE_RT_PUSH_IPI + rt_rq->push_flags = 0; + rt_rq->push_cpu = nr_cpu_ids; + raw_spin_lock_init(&rt_rq->push_lock); + init_irq_work(&rt_rq->push_work, push_irq_work_func); #endif +#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_rt_rq(rt_rq, cpu_rq(i)); + init_rt_rq(rt_rq); rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, 
i, parent->rt_se[i]); } @@ -831,11 +843,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) enqueue = 1; /* - * Force a clock update if the CPU was idle, - * lest wakeup -> unthrottle time accumulate. + * When we're idle and a woken (rt) task is + * throttled check_preempt_curr() will set + * skip_update and the time between the wakeup + * and this unthrottle will get accounted as + * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq->skip_clock_update = -1; + rq_clock_skip_update(rq, false); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; @@ -1337,7 +1352,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) curr->prio <= p->prio)) { int target = find_lowest_rq(p); - if (target != -1) + /* + * Don't bother moving it if the destination CPU is + * not running a lower priority task. + */ + if (target != -1 && + p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } rcu_read_unlock(); @@ -1614,6 +1634,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) lowest_rq = cpu_rq(cpu); + if (lowest_rq->rt.highest_prio.curr <= task->prio) { + /* + * Target rq has tasks of equal or higher priority, + * retrying does not release any lock and is unlikely + * to yield a different result. + */ + lowest_rq = NULL; + break; + } + /* if the prio of this runqueue changed, try again */ if (double_lock_balance(rq, lowest_rq)) { /* @@ -1760,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) ; } +#ifdef HAVE_RT_PUSH_IPI +/* + * The search for the next cpu always starts at rq->cpu and ends + * when we reach rq->cpu again. It will never return rq->cpu. + * This returns the next cpu to check, or nr_cpu_ids if the loop + * is complete. + * + * rq->rt.push_cpu holds the last cpu returned by this function, + * or if this is the first instance, it must hold rq->cpu. 
+ */ +static int rto_next_cpu(struct rq *rq) +{ + int prev_cpu = rq->rt.push_cpu; + int cpu; + + cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); + + /* + * If the previous cpu is less than the rq's CPU, then it already + * passed the end of the mask, and has started from the beginning. + * We end if the next CPU is greater or equal to rq's CPU. + */ + if (prev_cpu < rq->cpu) { + if (cpu >= rq->cpu) + return nr_cpu_ids; + + } else if (cpu >= nr_cpu_ids) { + /* + * We passed the end of the mask, start at the beginning. + * If the result is greater or equal to the rq's CPU, then + * the loop is finished. + */ + cpu = cpumask_first(rq->rd->rto_mask); + if (cpu >= rq->cpu) + return nr_cpu_ids; + } + rq->rt.push_cpu = cpu; + + /* Return cpu to let the caller know if the loop is finished or not */ + return cpu; +} + +static int find_next_push_cpu(struct rq *rq) +{ + struct rq *next_rq; + int cpu; + + while (1) { + cpu = rto_next_cpu(rq); + if (cpu >= nr_cpu_ids) + break; + next_rq = cpu_rq(cpu); + + /* Make sure the next rq can push to this rq */ + if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + break; + } + + return cpu; +} + +#define RT_PUSH_IPI_EXECUTING 1 +#define RT_PUSH_IPI_RESTART 2 + +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu; + + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + raw_spin_lock(&rq->rt.push_lock); + /* Make sure it's still executing */ + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + /* + * Tell the IPI to restart the loop as things have + * changed since it started. 
+ */ + rq->rt.push_flags |= RT_PUSH_IPI_RESTART; + raw_spin_unlock(&rq->rt.push_lock); + return; + } + raw_spin_unlock(&rq->rt.push_lock); + } + + /* When here, there's no IPI going around */ + + rq->rt.push_cpu = rq->cpu; + cpu = find_next_push_cpu(rq); + if (cpu >= nr_cpu_ids) + return; + + rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + + irq_work_queue_on(&rq->rt.push_work, cpu); +} + +/* Called from hardirq context */ +static void try_to_push_tasks(void *arg) +{ + struct rt_rq *rt_rq = arg; + struct rq *rq, *src_rq; + int this_cpu; + int cpu; + + this_cpu = rt_rq->push_cpu; + + /* Paranoid check */ + BUG_ON(this_cpu != smp_processor_id()); + + rq = cpu_rq(this_cpu); + src_rq = rq_of_rt_rq(rt_rq); + +again: + if (has_pushable_tasks(rq)) { + raw_spin_lock(&rq->lock); + push_rt_task(rq); + raw_spin_unlock(&rq->lock); + } + + /* Pass the IPI to the next rt overloaded queue */ + raw_spin_lock(&rt_rq->push_lock); + /* + * If the source queue changed since the IPI went out, + * we need to restart the search from that CPU again. + */ + if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { + rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; + rt_rq->push_cpu = src_rq->cpu; + } + + cpu = find_next_push_cpu(src_rq); + + if (cpu >= nr_cpu_ids) + rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; + raw_spin_unlock(&rt_rq->push_lock); + + if (cpu >= nr_cpu_ids) + return; + + /* + * It is possible that a restart caused this CPU to be + * chosen again. Don't bother with an IPI, just see if we + * have more to push. 
+ */ + if (unlikely(cpu == rq->cpu)) + goto again; + + /* Try the next RT overloaded CPU */ + irq_work_queue_on(&rt_rq->push_work, cpu); +} + +static void push_irq_work_func(struct irq_work *work) +{ + struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); + + try_to_push_tasks(rt_rq); +} +#endif /* HAVE_RT_PUSH_IPI */ + static int pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, ret = 0, cpu; @@ -1775,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) */ smp_rmb(); +#ifdef HAVE_RT_PUSH_IPI + if (sched_feat(RT_PUSH_IPI)) { + tell_cpu_to_push(this_rq); + return 0; + } +#endif + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a2a45c970e7..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,6 +6,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> +#include <linux/irq_work.h> #include <linux/tick.h> #include <linux/slab.h> @@ -362,8 +363,14 @@ struct cfs_rq { * Under CFS, load is tracked on a per-entity basis and aggregated up. * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). + * runnable_load_avg is the sum of the load_avg_contrib of the + * sched_entities on the rq. + * blocked_load_avg is similar to runnable_load_avg except that its + * the blocked sched_entities on the rq. + * utilization_load_avg is the sum of the average running time of the + * sched_entities on the rq. 
*/ - unsigned long runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; atomic64_t decay_counter; u64 last_decay; atomic_long_t removed_load; @@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* RT IPI pull logic requires IRQ_WORK */ +#ifdef CONFIG_IRQ_WORK +# define HAVE_RT_PUSH_IPI +#endif + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -435,7 +447,13 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; +#ifdef HAVE_RT_PUSH_IPI + int push_flags; + int push_cpu; + struct irq_work push_work; + raw_spinlock_t push_lock; #endif +#endif /* CONFIG_SMP */ int rt_queued; int rt_throttled; @@ -558,8 +576,6 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif - int skip_clock_update; - /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -588,6 +604,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; + unsigned int clock_skip_update; u64 clock; u64 clock_task; @@ -598,6 +615,7 @@ struct rq { struct sched_domain *sd; unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; unsigned char idle_balance; /* For active balancing */ @@ -687,16 +705,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return ACCESS_ONCE(rq->clock); +} + static inline u64 rq_clock(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock_task; } +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 + +static inline void rq_clock_skip_update(struct rq *rq, bool skip) +{ + lockdep_assert_held(&rq->lock); + if (skip) + 
rq->clock_skip_update |= RQCF_REQ_SKIP; + else + rq->clock_skip_update &= ~RQCF_REQ_SKIP; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -789,7 +826,7 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity, capacity_orig; + unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* @@ -1350,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta; + rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); sched_avg_update(rq); } #else @@ -1362,6 +1408,82 @@ static inline void sched_avg_update(struct rq *rq) { } extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); +/* + * __task_rq_lock - lock the rq @p resides on. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + return rq; + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 
+ */ +static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + return rq; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +static inline void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT @@ -1549,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index a476bea17fbc..87e2c9f0c33e 100644 --- a/kernel/sched/stats.c +++ 
b/kernel/sched/stats.c @@ -15,11 +15,6 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; - char *mask_str = kmalloc(mask_len, GFP_KERNEL); - - if (mask_str == NULL) - return -ENOMEM; if (v == (void *)1) { seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); @@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, - sched_domain_span(sd)); - seq_printf(seq, "domain%d %s", dcount++, mask_str); + seq_printf(seq, "domain%d %*pb", dcount++, + cpumask_pr_args(sched_domain_span(sd))); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %u %u %u %u %u %u %u %u", @@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v) rcu_read_unlock(); #endif } - kfree(mask_str); return 0; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4ef9687ac115..4f44028943e6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) switch (action) { case SECCOMP_RET_ERRNO: - /* Set the low-order 16-bits as a errno. */ + /* Set low-order bits as an errno, capped at MAX_ERRNO. 
*/ + if (data > MAX_ERRNO) + data = MAX_ERRNO; syscall_set_return_value(current, task_pt_regs(current), -data, 0); goto skip; diff --git a/kernel/signal.c b/kernel/signal.c index 16a305295256..a390499943e4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals); */ SYSCALL_DEFINE0(restart_syscall) { - struct restart_block *restart = &current_thread_info()->restart_block; + struct restart_block *restart = &current->restart_block; return restart->fn(restart); } @@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) SYSCALL_DEFINE0(pause) { while (!signal_pending(current)) { - current->state = TASK_INTERRUPTIBLE; + __set_current_state(TASK_INTERRUPTIBLE); schedule(); } return -ERESTARTNOHAND; @@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - current->state = TASK_INTERRUPTIBLE; + __set_current_state(TASK_INTERRUPTIBLE); schedule(); set_restore_sigmask(); return -ERESTARTNOHAND; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f032fb5284e3..40190f28db35 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -280,6 +280,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) unsigned int cpu; int ret = 0; + get_online_cpus(); mutex_lock(&smpboot_threads_lock); for_each_online_cpu(cpu) { ret = __smpboot_create_thread(plug_thread, cpu); @@ -292,6 +293,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) list_add(&plug_thread->list, &hotplug_threads); out: mutex_unlock(&smpboot_threads_lock); + put_online_cpus(); return ret; } EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9ac1be..479e4436f787 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == cnt) + if 
(preempt_count() == cnt) { +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); +#endif trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + } } EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ @@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu) * in the task stack here. */ __do_softirq(); - rcu_note_context_switch(); local_irq_enable(); - cond_resched(); + cond_resched_rcu_qs(); return; } local_irq_enable(); diff --git a/kernel/sys.c b/kernel/sys.c index a8c9f5a7dda6..a03d9cd23ed7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -97,6 +97,12 @@ #ifndef MPX_DISABLE_MANAGEMENT # define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) #endif +#ifndef GET_FP_MODE +# define GET_FP_MODE(a) (-EINVAL) +#endif +#ifndef SET_FP_MODE +# define SET_FP_MODE(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -1102,6 +1108,7 @@ DECLARE_RWSEM(uts_sem); /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60. 
*/ static int override_release(char __user *release, size_t len) { @@ -1121,7 +1128,7 @@ static int override_release(char __user *release, size_t len) break; rest++; } - v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); @@ -2210,11 +2217,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = MPX_ENABLE_MANAGEMENT(me); break; case PR_MPX_DISABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = MPX_DISABLE_MANAGEMENT(me); break; + case PR_SET_FP_MODE: + error = SET_FP_MODE(me, arg2); + break; + case PR_GET_FP_MODE: + error = GET_FP_MODE(me); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 137c7f69b264..ce410bb9f2e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1228,6 +1228,14 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = &zero, + }, + { .procname = "nr_pdflush_threads", .mode = 0444 /* read-only */, .proc_handler = pdflush_proc_obsolete, @@ -1248,7 +1256,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1257,7 +1264,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = &zero, }, #endif { @@ -1280,7 +1286,6 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = 
hugetlb_overcommit_handler, - .extra1 = &zero, }, #endif { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 670fff88a961..21f82c29c914 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); void *reply = genlmsg_data(genlhdr); - int rc; - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return rc; - } + genlmsg_end(skb, reply); return genlmsg_reply(skb, info); } @@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb, void *reply = genlmsg_data(genlhdr); int rc, delcount = 0; - rc = genlmsg_end(skb, reply); - if (rc < 0) { - nlmsg_free(skb); - return; - } + genlmsg_end(skb, reply); rc = 0; down_read(&listeners->sem); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d626dc98e8df..579ce1b929af 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET config GENERIC_CLOCKEVENTS bool -# Migration helper. 
Builds, but does not invoke -config GENERIC_CLOCKEVENTS_BUILD - bool - default y - depends on GENERIC_CLOCKEVENTS - # Architecture can handle broadcast in a driver-agnostic way config ARCH_HAS_TICK_BROADCAST bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f622cf28628a..01f0312419b3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,16 +1,14 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) obj-y += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o -obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o -obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a7077d3ae52f..1b001ed1edb9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, goto out; } - restart = &current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp.tv64; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 55449909f114..25d942d1da27 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct 
clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); +static int __clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + /* Transition with legacy set_mode() callback */ + if (dev->set_mode) { + /* Legacy callback doesn't support new modes */ + if (state > CLOCK_EVT_STATE_ONESHOT) + return -ENOSYS; + /* + * 'clock_event_state' and 'clock_event_mode' have 1-to-1 + * mapping until *_ONESHOT, and so a simple cast will work. + */ + dev->set_mode((enum clock_event_mode)state, dev); + dev->mode = (enum clock_event_mode)state; + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* Transition with new state-specific callbacks */ + switch (state) { + case CLOCK_EVT_STATE_DETACHED: + /* + * This is an internal state, which is guaranteed to go from + * SHUTDOWN to DETACHED. No driver interaction required. + */ + return 0; + + case CLOCK_EVT_STATE_SHUTDOWN: + return dev->set_state_shutdown(dev); + + case CLOCK_EVT_STATE_PERIODIC: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) + return -ENOSYS; + return dev->set_state_periodic(dev); + + case CLOCK_EVT_STATE_ONESHOT: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return -ENOSYS; + return dev->set_state_oneshot(dev); + + default: + return -ENOSYS; + } +} + /** - * clockevents_set_mode - set the operating mode of a clock event device + * clockevents_set_state - set the operating state of a clock event device * @dev: device to modify - * @mode: new mode + * @state: new state * * Must be called with interrupts disabled ! 
*/ -void clockevents_set_mode(struct clock_event_device *dev, - enum clock_event_mode mode) +void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state) { - if (dev->mode != mode) { - dev->set_mode(mode, dev); - dev->mode = mode; + if (dev->state != state) { + if (__clockevents_set_state(dev, state)) + return; + + dev->state = state; /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ - if (mode == CLOCK_EVT_MODE_ONESHOT) { + if (state == CLOCK_EVT_STATE_ONESHOT) { if (unlikely(!dev->mult)) { dev->mult = 1; WARN_ON(1); @@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev, */ void clockevents_shutdown(struct clock_event_device *dev) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event.tv64 = KTIME_MAX; } +/** + * clockevents_tick_resume - Resume the tick device before using it again + * @dev: device to resume + */ +int clockevents_tick_resume(struct clock_event_device *dev) +{ + int ret = 0; + + if (dev->set_mode) { + dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); + dev->mode = CLOCK_EVT_MODE_RESUME; + } else if (dev->tick_resume) { + ret = dev->tick_resume(dev); + } + + return ret; +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffie */ @@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; dev->retries++; @@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; dev->retries++; @@ -252,7 +321,7 @@ int clockevents_program_event(struct 
clock_event_device *dev, ktime_t expires, dev->next_event = expires; - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; /* Shortcut for clockevent devices that can deal with ktime. */ @@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { - if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) continue; if (!tick_check_replacement(newdev, dev)) @@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ - if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + if (ced->state == CLOCK_EVT_STATE_DETACHED) { list_del_init(&ced->list); return 0; } @@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind); +/* Sanity check of state transition callbacks */ +static int clockevents_sanity_check(struct clock_event_device *dev) +{ + /* Legacy set_mode() callback */ + if (dev->set_mode) { + /* We shouldn't be supporting new modes now */ + WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || + dev->set_state_shutdown || dev->tick_resume); + + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* New state-specific callbacks */ + if (!dev->set_state_shutdown) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !dev->set_state_periodic) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && + !dev->set_state_oneshot) + return -EINVAL; + + return 0; +} + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev) { 
unsigned long flags; - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(clockevents_sanity_check(dev)); + + /* Initialize state to DETACHED */ + dev->state = CLOCK_EVT_STATE_DETACHED; + if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); @@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); - if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + if (dev->state == CLOCK_EVT_STATE_ONESHOT) return clockevents_program_event(dev, dev->next_event, false); - if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + if (dev->state == CLOCK_EVT_STATE_PERIODIC) + return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } @@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev) * @old: device to release (can be NULL) * @new: device to request (can be NULL) * - * Called from the notifier chain. clockevents_lock is held already + * Called from various tick functions with clockevents_lock held and + * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { - unsigned long flags; - - local_irq_save(flags); /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. 
*/ if (old) { module_put(old->owner); - clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); + clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); list_del(&old->list); list_add(&old->list, &clockevents_released); } if (new) { - BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); clockevents_shutdown(new); } - local_irq_restore(flags); } /** @@ -541,74 +642,40 @@ void clockevents_resume(void) dev->resume(dev); } -#ifdef CONFIG_GENERIC_CLOCKEVENTS +#ifdef CONFIG_HOTPLUG_CPU /** - * clockevents_notify - notification about relevant events - * Returns 0 on success, any other value on error + * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu */ -int clockevents_notify(unsigned long reason, void *arg) +void tick_cleanup_dead_cpu(int cpu) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); - switch (reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, arg); - break; - - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - ret = tick_broadcast_oneshot_control(reason); - break; - - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(arg); - break; - - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(arg); - tick_shutdown_broadcast(arg); - tick_shutdown(arg); - /* - * Unregister the clock event devices which were - * released from the users in the notify chain. - */ - list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + tick_shutdown_broadcast_oneshot(cpu); + tick_shutdown_broadcast(cpu); + tick_shutdown(cpu); + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. 
+ */ + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { + BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); list_del(&dev->list); - /* - * Now check whether the CPU has left unused per cpu devices - */ - cpu = *((int *)arg); - list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { - if (cpumask_test_cpu(cpu, dev->cpumask) && - cpumask_weight(dev->cpumask) == 1 && - !tick_is_broadcast_device(dev)) { - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - list_del(&dev->list); - } } - break; - default: - break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); - return ret; } -EXPORT_SYMBOL_GPL(clockevents_notify); +#endif #ifdef CONFIG_SYSFS struct bus_type clockevents_subsys = { @@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void) } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */ - -#endif /* GENERIC_CLOCK_EVENTS */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b79f39bda7e1..15facb1b9c60 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -34,82 +34,6 @@ #include "tick-internal.h" #include "timekeeping_internal.h" -void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp) -{ - tc->cc = cc; - tc->cycle_last = cc->read(cc); - tc->nsec = start_tstamp; -} -EXPORT_SYMBOL_GPL(timecounter_init); - -/** - * timecounter_read_delta - get nanoseconds since last call of this function - * @tc: Pointer to time counter - * - * When the underlying cycle counter runs over, this will be handled - * correctly as long as it does not run over more than once between - * calls. 
- * - * The first call to this function for a new time counter initializes - * the time tracking and returns an undefined result. - */ -static u64 timecounter_read_delta(struct timecounter *tc) -{ - cycle_t cycle_now, cycle_delta; - u64 ns_offset; - - /* read cycle counter: */ - cycle_now = tc->cc->read(tc->cc); - - /* calculate the delta since the last timecounter_read_delta(): */ - cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; - - /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); - - /* update time stamp of timecounter_read_delta() call: */ - tc->cycle_last = cycle_now; - - return ns_offset; -} - -u64 timecounter_read(struct timecounter *tc) -{ - u64 nsec; - - /* increment time by nanoseconds since last call */ - nsec = timecounter_read_delta(tc); - nsec += tc->nsec; - tc->nsec = nsec; - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_read); - -u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) -{ - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. 
- */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); - } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); - /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks * @mult: pointer to mult variable @@ -218,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) schedule_work(&watchdog_work); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) -{ - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); - __clocksource_unstable(cs); -} - /** * clocksource_mark_unstable - mark clocksource unstable via watchdog * @cs: clocksource to be marked unstable @@ -250,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; - cycle_t csnow, wdnow, delta; + cycle_t csnow, wdnow, cslast, wdlast, delta; int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; @@ -289,6 +206,8 @@ static void clocksource_watchdog(unsigned long data) delta = clocksource_delta(csnow, cs->cs_last, cs->mask); cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wdlast = cs->wd_last; /* save these in case we print them */ + cslast = cs->cs_last; cs->cs_last = csnow; cs->wd_last = wdnow; @@ -297,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. 
*/ if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - clocksource_unstable(cs, cs_nsec - wd_nsec); + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, csnow, cslast, cs->mask); + __clocksource_unstable(cs); continue; } @@ -545,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * @shift: cycle to nanosecond divisor (power of two) * @maxadj: maximum adjustment value to mult (~11%) * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) + * + * NOTE: This function includes a safety margin of 50%, in other words, we + * return half the number of nanoseconds the hardware counter can technically + * cover. This is done so that we can potentially detect problems caused by + * delayed timers or bad hardware, which might result in time intervals that + * are larger then what the math used can handle without overflows. */ -u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { u64 max_nsecs, max_cycles; /* * Calculate the maximum number of cycles that we can pass to the - * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) - * which is equivalent to the below. 
- * max_cycles < (2^63)/(mult + maxadj) - * max_cycles < 2^(log2((2^63)/(mult + maxadj))) - * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) - * max_cycles < 2^(63 - log2(mult + maxadj)) - * max_cycles < 1 << (63 - log2(mult + maxadj)) - * Please note that we add 1 to the result of the log2 to account for - * any rounding errors, ensure the above inequality is satisfied and - * no overflow will occur. + * cyc2ns() function without overflowing a 64-bit result. */ - max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); + max_cycles = ULLONG_MAX; + do_div(max_cycles, mult+maxadj); /* * The actual maximum number of cycles we can defer the clocksource is @@ -575,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) max_cycles = min(max_cycles, mask); max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + + /* Return 50% of the actual maximum, so we can detect bad values */ + max_nsecs >>= 1; + return max_nsecs; } /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred - * @cs: Pointer to clocksource + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated * */ -static u64 clocksource_max_deferment(struct clocksource *cs) +static inline void clocksource_update_max_deferment(struct clocksource *cs) { - u64 max_nsecs; - - max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, - cs->mask); - /* - * To ensure that the clocksource does not wrap whilst we are idle, - * limit the time the clocksource can be deferred by 12.5%. Please - * note a margin of 12.5% is used because this can be computed with - * a shift, versus say 10% which would require division. 
- */ - return max_nsecs - (max_nsecs >> 3); + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -724,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs) } /** - * __clocksource_updatefreq_scale - Used update clocksource with new freq + * __clocksource_update_freq_scale - Used update clocksource with new freq * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale @@ -732,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs) * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. + * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper + * functions. */ -void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; + /* - * Calc the maximum number of seconds which we can run before - * wrapping around. For clocksources which have a mask > 32bit - * we need to limit the max sleep time to have a good - * conversion precision. 10 minutes is still a reasonable - * amount. That results in a shift value of 24 for a - * clocksource with mask >= 40bit and f >= 4GHz. That maps to - * ~ 0.06ppm granularity for NTP. We apply the same 12.5% - * margin as we do in clocksource_max_deferment() + * Default clocksources are *special* and self-define their mult/shift. + * But, you're not special, so you should specify a freq value. 
*/ - sec = (cs->mask - (cs->mask >> 3)); - do_div(sec, freq); - do_div(sec, scale); - if (!sec) - sec = 1; - else if (sec > 600 && cs->mask > UINT_MAX) - sec = 600; - - clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, - NSEC_PER_SEC / scale, sec * scale); - + if (freq) { + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32-bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40-bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. + */ + sec = cs->mask; + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + } /* - * for clocksources that have large mults, to avoid overflow. - * Since mult may be adjusted by ntp, add an safety extra margin - * + * Ensure clocksources that have large 'mult' values don't overflow + * when adjusted. */ cs->maxadj = clocksource_max_adjustment(cs); - while ((cs->mult + cs->maxadj < cs->mult) - || (cs->mult - cs->maxadj > cs->mult)) { + while (freq && ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult))) { cs->mult >>= 1; cs->shift--; cs->maxadj = clocksource_max_adjustment(cs); } - cs->max_idle_ns = clocksource_max_deferment(cs); + /* + * Only warn for *special* clocksources that self-define + * their mult/shift values and don't specify a freq. 
+ */ + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + + clocksource_update_max_deferment(cs); + + pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } -EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); +EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); /** * __clocksource_register_scale - Used to install new clocksources @@ -790,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { /* Initialize mult/shift and max_idle_ns */ - __clocksource_updatefreq_scale(cs, scale, freq); + __clocksource_update_freq_scale(cs, scale, freq); /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); @@ -802,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); - -/** - * clocksource_register - Used to install new clocksources - * @cs: clocksource to be registered - * - * Returns -EBUSY if registration fails, zero otherwise. 
- */ -int clocksource_register(struct clocksource *cs) -{ - /* calculate max adjustment for given mult/shift */ - cs->maxadj = clocksource_max_adjustment(cs); - WARN_ONCE(cs->mult + cs->maxadj < cs->mult, - "Clocksource %s might overflow on 11%% adjustment\n", - cs->name); - - /* calculate max idle time permitted for this clocksource */ - cs->max_idle_ns = clocksource_max_deferment(cs); - - mutex_lock(&clocksource_mutex); - clocksource_enqueue(cs); - clocksource_enqueue_watchdog(cs); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -EXPORT_SYMBOL(clocksource_register); - static void __clocksource_change_rating(struct clocksource *cs, int rating) { list_del(&cs->list); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..76d4bd962b19 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -54,7 +54,7 @@ #include <trace/events/timer.h> -#include "timekeeping.h" +#include "tick-internal.h" /* * The timer bases: @@ -122,7 +122,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); boot = ktime_add(mono, off_boot); xtim = ktime_add(mono, off_real); - tai = ktime_add(xtim, off_tai); + tai = ktime_add(mono, off_tai); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; @@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /* * Divide a ktime value by a nanosecond value */ -u64 ktime_divns(const ktime_t kt, s64 div) +u64 __ktime_divns(const ktime_t kt, s64 div) { u64 dclc; int sft = 0; @@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } -EXPORT_SYMBOL_GPL(ktime_divns); +EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* @@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } +#if defined(CONFIG_NO_HZ_COMMON) || 
defined(CONFIG_HIGH_RES_TIMERS) +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +{ + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next.tv64 < 0) + expires_next.tv64 = 0; + return expires_next; +} +#endif + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next; - - expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - timer = container_of(next, struct hrtimer, node); - - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - /* - * clock_was_set() has changed base->offset so the - * result might be negative. 
Fix it up to prevent a - * false positive in clockevents_program_event() - */ - if (expires.tv64 < 0) - expires.tv64 = 0; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - } + ktime_t expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, return 0; /* + * When the target cpu of the timer is currently executing + * hrtimer_interrupt(), then we do not touch the clock event + * device. hrtimer_interrupt() will reevaluate all clock bases + * before reprogramming the device. + */ + if (cpu_base->in_hrtirq) + return 0; + + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry * which we enforced in the hang detection. We want the system @@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + ktime_t mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; - int i; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - delta.tv64 = hrtimer_get_expires_tv64(timer); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } + if (!hrtimer_hres_active()) + mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), + ktime_get()); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock(&cpu_base->lock); entry_time = now = 
hrtimer_update_base(cpu_base); retry: - expires_next.tv64 = KTIME_MAX; + cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1291,28 +1291,20 @@ retry: * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ - - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (expires.tv64 < 0) - expires.tv64 = KTIME_MAX; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - } __run_hrtimer(timer, &basenow); } } - + /* Reevaluate the clock bases for the next expiry */ + expires_next = __hrtimer_get_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? 
*/ @@ -1591,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, goto out; } - restart = &current_thread_info()->restart_block; + restart = &current->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; @@ -1715,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - case CPU_DYING_FROZEN: - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); - break; case CPU_DEAD: case CPU_DEAD_FROZEN: - { - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); migrate_hrtimers(scpu); break; - } #endif default: diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..347fecf86a3f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -25,7 +25,7 @@ #include <linux/module.h> #include <linux/init.h> -#include "tick-internal.h" +#include "timekeeping.h" /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on @@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, + .max_cycles = 10, }; __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); @@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies); static int __init init_jiffies_clocksource(void) { - return clocksource_register(&clocksource_jiffies); + return __clocksource_register(&clocksource_jiffies); } core_initcall(init_jiffies_clocksource); @@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; - clocksource_register(&refined_jiffies); + __clocksource_register(&refined_jiffies); return 0; } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..7a681003001c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,7 +17,6 @@ #include <linux/module.h>
#include <linux/rtc.h> -#include "tick-internal.h" #include "ntp_internal.h" /* @@ -459,6 +458,16 @@ out: return leap; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock64(struct timespec64 now64) +{ + struct timespec now; + + now = timespec64_to_timespec(now64); + return update_persistent_clock(now); +} +#endif + #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_cmos_clock(struct work_struct *work); @@ -488,14 +497,15 @@ static void sync_cmos_clock(struct work_struct *work) getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = timespec64_to_timespec(now); + struct timespec64 adjust = now; fail = -ENODEV; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock(adjust); + fail = update_persistent_clock64(adjust); #endif + #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) fail = rtc_set_ntp_time(adjust); @@ -633,6 +643,17 @@ int ntp_validate_timex(struct timex *txc) if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) return -EPERM; + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + return 0; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a16b67859e2a..0075da74abf0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { - struct restart_block *restart_block = - &current_thread_info()->restart_block; + struct restart_block *restart_block =
&current->restart_block; struct itimerspec it; int error; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -1,5 +1,6 @@ /* - * sched_clock.c: support for extending counters to full 64-bit ns counter + * sched_clock.c: Generic sched_clock() support, to extend low level + * hardware time counters to full 64-bit ns values. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -18,15 +19,53 @@ #include <linux/seqlock.h> #include <linux/bitops.h> -struct clock_data { - ktime_t wrap_kt; +/** + * struct clock_read_data - data required to read from sched_clock() + * + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=40 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); u32 mult; u32 shift; - bool suspended; +}; + +/** + * struct clock_data - all data needed for sched_clock() (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. The lowest + * bit is the index for @read_data. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping. + * @rate: Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function. + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular 'seq' and 'read_data[0]' (combined) should fit + * into a single 64-byte cache line. + */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data[2]; + ktime_t wrap_kt; + unsigned long rate; + + u64 (*actual_read_sched_clock)(void); }; static struct hrtimer sched_clock_timer; @@ -34,12 +73,6 @@ static int irqtime = -1; core_param(irqtime, irqtime, int, 0400); -static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd ____cacheline_aligned = { + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, +}; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; - u64 cyc; + u64 cyc, res; unsigned long seq; - - if (cd.suspended) - return cd.epoch_ns; + struct clock_read_data *rd; do { - seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1); + + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); } while (read_seqcount_retry(&cd.seq, seq)); - cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; +} + +/* + * Updating 
the data required to read the clock. + * + * sched_clock() will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock() to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock() as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); } /* - * Atomically update the sched_clock epoch. + * Atomically update the sched_clock() epoch. */ -static void notrace update_sched_clock(void) +static void update_sched_clock(void) { - unsigned long flags; u64 cyc; u64 ns; + struct clock_read_data rd; + + rd = cd.read_data[0]; + + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; - cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); - - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + update_clock_read_data(&rd); } static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { update_sched_clock(); hrtimer_forward_now(hrt, cd.wrap_kt); + return HRTIMER_RESTART; } -void __init sched_clock_register(u64 (*read)(void), int bits, - unsigned long rate) +void __init +sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 
res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; + struct clock_read_data rd; if (cd.rate > rate) return; WARN_ON(!irqs_disabled()); - /* calculate the mult/shift to convert counter ticks to ns. */ + /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate; + + /* Calculate how many nanosecs until we risk wrapping */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); + cd.wrap_kt = ns_to_ktime(wrap); - /* calculate how many ns until we wrap */ - wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + rd = cd.read_data[0]; - /* update epoch for new counter and update epoch_ns from old counter*/ + /* Update epoch for new counter and update 'epoch_ns' from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + cd.actual_read_sched_clock = read; - raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + + update_clock_read_data(&rd); r = rate; if (r >= 4000000) { r /= 1000000; r_unit = 'M'; - } else if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else - r_unit = ' '; - - /* calculate the ns resolution of this counter */ + } else { + if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else { + 
r_unit = ' '; + } + } + + /* Calculate the ns resolution of this counter */ res = cyc_to_ns(1ULL, new_mult, new_shift); pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - /* Enable IRQ time accounting if we have a fast enough sched_clock */ + /* Enable IRQ time accounting if we have a fast enough sched_clock() */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); @@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits, void __init sched_clock_postinit(void) { /* - * If no sched_clock function has been provided at that point, + * If no sched_clock() function has been provided at that point, * make it the final one one. */ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); @@ -189,29 +251,53 @@ void __init sched_clock_postinit(void) hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); } +/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of 'epoch_cyc'. 
+ */ +static u64 notrace suspended_sched_clock_read(void) +{ + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; +} + static int sched_clock_suspend(void) { + struct clock_read_data *rd = &cd.read_data[0]; + update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; + rd->read_sched_clock = suspended_sched_clock_read; + return 0; } static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_read_data *rd = &cd.read_data[0]; + + rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + rd->read_sched_clock = cd.actual_read_sched_clock; } static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, }; static int __init sched_clock_syscore_init(void) { register_syscore_ops(&sched_clock_ops); + return 0; } device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { + int bc_moved; /* * We try to cancel the timer first. If the callback is on * flight on some other cpu then we let it handle it. If we @@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * restart the timer because we are in the callback, but we * can set the expiry time and let the callback return * HRTIMER_RESTART. + * + * Since we are in the idle loop at this point and because + * hrtimer_{start/cancel} functions call into tracing, + * calls to these functions must be bound within RCU_NONIDLE. 
*/ - if (hrtimer_try_to_cancel(&bctimer) >= 0) { - hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? + !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : + 0); + if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); } else if (bc->bound_on == smp_processor_id()) { diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 066f0ec05e48..7e8ca4f448a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask; static cpumask_var_t tick_broadcast_on; static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); -static int tick_broadcast_force; +static int tick_broadcast_forced; #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); #else static inline void tick_broadcast_clear_oneshot(int cpu) { } +static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } #endif /* @@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * The device is in periodic mode. No reprogramming necessary: */ - if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + if (dev->state == CLOCK_EVT_STATE_PERIODIC) goto unlock; /* @@ -324,49 +326,54 @@ unlock: raw_spin_unlock(&tick_broadcast_lock); } -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop +/** + * tick_broadcast_control - Enable/disable or force broadcast mode + * @mode: The selected broadcast mode + * + * Called when the system enters a state where affected tick devices + * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. 
*/ -static void tick_do_broadcast_on_off(unsigned long *reason) +void tick_broadcast_control(enum tick_broadcast_mode mode) { struct clock_event_device *bc, *dev; struct tick_device *td; - unsigned long flags; int cpu, bc_stopped; - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); + td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; - bc = tick_broadcast_device.evtdev; /* * Is the device not affected by the powerstate ? */ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; + return; if (!tick_device_is_functional(dev)) - goto out; + return; + raw_spin_lock(&tick_broadcast_lock); + cpu = smp_processor_id(); + bc = tick_broadcast_device.evtdev; bc_stopped = cpumask_empty(tick_broadcast_mask); - switch (*reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + switch (mode) { + case TICK_BROADCAST_FORCE: + tick_broadcast_forced = 1; + case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } - if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) - tick_broadcast_force = 1; break; - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (tick_broadcast_force) + + case TICK_BROADCAST_OFF: + if (tick_broadcast_forced) break; cpumask_clear_cpu(cpu, tick_broadcast_on); if (!tick_device_is_functional(dev)) @@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason) else tick_broadcast_setup_oneshot(bc); } -out: - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop. 
- */ -void tick_broadcast_on_off(unsigned long reason, int *oncpu) -{ - if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) - printk(KERN_ERR "tick-broadcast: ignoring broadcast for " - "offline CPU #%d\n", *oncpu); - else - tick_do_broadcast_on_off(&reason); + raw_spin_unlock(&tick_broadcast_lock); } +EXPORT_SYMBOL_GPL(tick_broadcast_control); /* * Set the periodic handler depending on broadcast on/off @@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) dev->event_handler = tick_handle_periodic_broadcast; } +#ifdef CONFIG_HOTPLUG_CPU /* * Remove a CPU from broadcasting */ -void tick_shutdown_broadcast(unsigned int *cpup) +void tick_shutdown_broadcast(unsigned int cpu) { struct clock_event_device *bc; unsigned long flags; - unsigned int cpu = *cpup; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#endif void tick_suspend_broadcast(void) { @@ -453,38 +448,48 @@ void tick_suspend_broadcast(void) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -int tick_resume_broadcast(void) +/* + * This is called from tick_resume_local() on a resuming CPU. That's + * called from the core resume function, tick_unfreeze() and the magic XEN + * resume hackery. + * + * In none of these cases the broadcast device mode can change and the + * bit of the resuming CPU in the broadcast mask is safe as well. 
+ */ +bool tick_resume_check_broadcast(void) +{ + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) + return false; + else + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); +} + +void tick_resume_broadcast(void) { struct clock_event_device *bc; unsigned long flags; - int broadcast = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); + clockevents_tick_resume(bc); switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); - broadcast = cpumask_test_cpu(smp_processor_id(), - tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: if (!cpumask_empty(tick_broadcast_mask)) - broadcast = tick_resume_broadcast_oneshot(bc); + tick_resume_broadcast_oneshot(bc); break; } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); - - return broadcast; } - #ifdef CONFIG_TICK_ONESHOT static cpumask_var_t tick_broadcast_oneshot_mask; @@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, { int ret; - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + if (bc->state != CLOCK_EVT_STATE_ONESHOT) + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); ret = clockevents_program_event(bc, expires, force); if (!ret) @@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, return ret; } -int tick_resume_broadcast_oneshot(struct clock_event_device *bc) +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return 0; + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); } /* @@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void) * switched over, leave the device alone. 
*/ if (td->mode == TICKDEV_MODE_ONESHOT) { - clockevents_set_mode(td->evtdev, - CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(td->evtdev, + CLOCK_EVT_STATE_ONESHOT); } } } @@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, if (dev->next_event.tv64 < bc->next_event.tv64) return; } - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -static void broadcast_move_bc(int deadcpu) -{ - struct clock_event_device *bc = tick_broadcast_device.evtdev; - - if (!bc || !broadcast_needs_cpu(bc, deadcpu)) - return; - /* This moves the broadcast assignment to this cpu */ - clockevents_program_event(bc, bc->next_event, 1); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. */ -int tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; struct tick_device *td; - unsigned long flags; - ktime_t now; int cpu, ret = 0; + ktime_t now; /* * Periodic mode does not care about the enter/exit of power @@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason) * We are called with preemtion disabled from the depth of the * idle code, so we can't be moved away. 
*/ - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); + td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; + raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + if (state == TICK_BROADCAST_ENTER) { if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); broadcast_shutdown_local(bc, dev); @@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason) } } out: - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock(&tick_broadcast_lock); return ret; } +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Reset the one shot broadcast for a cpu @@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) /* Set it up only once ! 
*/ if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; bc->event_handler = tick_handle_oneshot_broadcast; @@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); tick_broadcast_set_event(bc, cpu, tick_next_period, 1); @@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#ifdef CONFIG_HOTPLUG_CPU +void hotplug_cpu__broadcast_tick_pull(int deadcpu) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + bc = tick_broadcast_device.evtdev; + + if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* This moves the broadcast assignment to this CPU: */ + clockevents_program_event(bc, bc->next_event, 1); + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} /* * Remove a dead CPU from broadcasting */ -void tick_shutdown_broadcast_oneshot(unsigned int *cpup) +void tick_shutdown_broadcast_oneshot(unsigned int cpu) { unsigned long flags; - unsigned int cpu = *cpup; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - broadcast_move_bc(cpu); - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#endif /* * Check, whether the broadcast device is in one shot mode diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7efeedf53ebd..3ae6afa1eb98 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -102,7 +102,7 @@ void 
tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + if (dev->state != CLOCK_EVT_STATE_ONESHOT) return; for (;;) { /* @@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned long seq; ktime_t next; @@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) next = tick_next_period; } while (read_seqretry(&jiffies_lock, seq)); - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) @@ -332,14 +332,16 @@ out_bc: tick_install_broadcast_device(newdev); } +#ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. * - * Called with interrupts disabled. + * Called with interrupts disabled. Not locking required. If + * tick_do_timer_cpu is owned by this cpu, nothing can change it. */ -void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(void) { - if (*cpup == tick_do_timer_cpu) { + if (tick_do_timer_cpu == smp_processor_id()) { int cpu = cpumask_first(cpu_online_mask); tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : @@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup) * access the hardware device itself. * We just set the mode and remove it from the lists. */ -void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int cpu) { - struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; @@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup) * Prevent that the clock events layer tries to call * the set mode function! 
*/ + dev->state = CLOCK_EVT_STATE_DETACHED; dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } } +#endif -void tick_suspend(void) +/** + * tick_suspend_local - Suspend the local tick device + * + * Called from the local cpu for freeze with interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend_local(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); clockevents_shutdown(td->evtdev); } -void tick_resume(void) +/** + * tick_resume_local - Resume the local tick device + * + * Called from the local CPU for unfreeze or XEN resume magic. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume_local(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); - int broadcast = tick_resume_broadcast(); - - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + bool broadcast = tick_resume_check_broadcast(); + clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(td->evtdev, 0); @@ -395,6 +412,83 @@ void tick_resume(void) } /** + * tick_suspend - Suspend the tick and the broadcast device + * + * Called from syscore_suspend() via timekeeping_suspend with only one + * CPU online and interrupts disabled or from tick_unfreeze() under + * tick_freeze_lock. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend(void) +{ + tick_suspend_local(); + tick_suspend_broadcast(); +} + +/** + * tick_resume - Resume the tick and the broadcast device + * + * Called from syscore_resume() via timekeeping_resume with only one + * CPU online and interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. 
+ */ +void tick_resume(void) +{ + tick_resume_broadcast(); + tick_resume_local(); +} + +static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static unsigned int tick_freeze_depth; + +/** + * tick_freeze - Suspend the local tick and (possibly) timekeeping. + * + * Check if this is the last online CPU executing the function and if so, + * suspend timekeeping. Otherwise suspend the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). + * Interrupts must not be enabled before the subsequent %tick_unfreeze(). + */ +void tick_freeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + tick_freeze_depth++; + if (tick_freeze_depth == num_online_cpus()) + timekeeping_suspend(); + else + tick_suspend_local(); + + raw_spin_unlock(&tick_freeze_lock); +} + +/** + * tick_unfreeze - Resume the local tick and (possibly) timekeeping. + * + * Check if this is the first CPU executing the function and if so, resume + * timekeeping. Otherwise resume the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_freeze(). + * Interrupts must not be enabled after the preceding %tick_freeze(). 
+ */ +void tick_unfreeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + if (tick_freeze_depth == num_online_cpus()) + timekeeping_resume(); + else + tick_resume_local(); + + tick_freeze_depth--; + + raw_spin_unlock(&tick_freeze_lock); +} + +/** * tick_init - initialize the tick control */ void __init tick_init(void) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 366aeb4f2c66..b64fdd8054c5 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -5,15 +5,12 @@ #include <linux/tick.h> #include "timekeeping.h" +#include "tick-sched.h" -extern seqlock_t jiffies_lock; +#ifdef CONFIG_GENERIC_CLOCKEVENTS -#define CS_NAME_LEN 32 - -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD - -#define TICK_DO_TIMER_NONE -1 -#define TICK_DO_TIMER_BOOT -2 +# define TICK_DO_TIMER_NONE -1 +# define TICK_DO_TIMER_BOOT -2 DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; @@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); -extern void tick_handover_do_timer(int *cpup); -extern void tick_shutdown(unsigned int *cpup); +extern void tick_shutdown(unsigned int cpu); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev); extern void tick_install_replacement(struct clock_event_device *dev); +extern int tick_is_oneshot_available(void); +extern struct tick_device *tick_get_device(int cpu); -extern void clockevents_shutdown(struct clock_event_device *dev); +extern int clockevents_tick_resume(struct clock_event_device *dev); +/* Check, if the device is functional or a dummy for broadcast */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return 
!(dev->features & CLOCK_EVT_FEAT_DUMMY); +} +extern void clockevents_shutdown(struct clock_event_device *dev); +extern void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new); +extern void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state); +extern int clockevents_program_event(struct clock_event_device *dev, + ktime_t expires, bool force); +extern void clockevents_handle_noop(struct clock_event_device *dev); +extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); -/* - * NO_HZ / high resolution timer shared code - */ +/* Broadcasting support */ +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern void tick_install_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_shutdown_broadcast(unsigned int cpu); +extern void tick_suspend_broadcast(void); +extern void tick_resume_broadcast(void); +extern bool tick_resume_check_broadcast(void); +extern void tick_broadcast_init(void); +extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); +extern struct tick_device *tick_get_broadcast_device(void); +extern struct cpumask *tick_get_broadcast_mask(void); +# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } +static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_shutdown_broadcast(unsigned int cpu) { } +static 
inline void tick_suspend_broadcast(void) { } +static inline void tick_resume_broadcast(void) { } +static inline bool tick_resume_check_broadcast(void) { return false; } +static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } + +/* Set the periodic handler in non broadcast mode */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ + +#else /* !GENERIC_CLOCKEVENTS: */ +static inline void tick_suspend(void) { } +static inline void tick_resume(void) { } +#endif /* !GENERIC_CLOCKEVENTS */ + +/* Oneshot related functions */ #ifdef CONFIG_TICK_ONESHOT extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); -# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static inline bool tick_oneshot_possible(void) { return true; } +extern int tick_oneshot_mode_active(void); +extern void tick_clock_notify(void); +extern int tick_check_oneshot_change(int allow_nohz); +extern int tick_init_highres(void); +#else /* !CONFIG_TICK_ONESHOT: */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) { BUG(); } +static inline void tick_resume_oneshot(void) { BUG(); } +static inline int tick_program_event(ktime_t expires, int force) { return 0; } +static inline void tick_oneshot_notify(void) { } +static inline bool tick_oneshot_possible(void) { return false; } +static inline int tick_oneshot_mode_active(void) { return 0; } +static inline void tick_clock_notify(void) { } +static 
inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +#endif /* !CONFIG_TICK_ONESHOT */ + +/* Functions related to oneshot broadcasting */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); -extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); -# else /* BROADCAST */ -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) -{ - BUG(); -} -static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } +extern struct cpumask *tick_get_broadcast_oneshot_mask(void); +#else /* !(BROADCAST && ONESHOT): */ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } -static inline bool tick_broadcast_oneshot_available(void) { return true; } -# endif /* !BROADCAST */ - -#else /* !ONESHOT */ -static inline -void tick_setup_oneshot(struct clock_event_device *newdev, - void (*handler)(struct clock_event_device *), - ktime_t nextevt) -{ - BUG(); -} -static inline void tick_resume_oneshot(void) -{ - BUG(); -} -static inline int tick_program_event(ktime_t expires, int force) -{ - return 0; -} -static inline void tick_oneshot_notify(void) 
{ } -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) -{ - BUG(); -} -static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } -static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } -static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) -{ - return 0; -} -static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline bool tick_broadcast_oneshot_available(void) { return false; } -#endif /* !TICK_ONESHOT */ +static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } +#endif /* !(BROADCAST && ONESHOT) */ /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL @@ -105,68 +137,3 @@ extern void tick_nohz_init(void); # else static inline void tick_nohz_init(void) { } #endif - -/* - * Broadcasting support - */ -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST -extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern void tick_install_broadcast_device(struct clock_event_device *dev); -extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); -extern void tick_shutdown_broadcast(unsigned int *cpup); -extern void tick_suspend_broadcast(void); -extern int tick_resume_broadcast(void); -extern void tick_broadcast_init(void); -extern void -tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); -int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); - -#else /* !BROADCAST */ - -static inline void tick_install_broadcast_device(struct clock_event_device *dev) -{ -} - -static inline int tick_is_broadcast_device(struct clock_event_device *dev) -{ - return 0; -} -static inline int tick_device_uses_broadcast(struct clock_event_device *dev, - int cpu) -{ - return 0; -} -static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void 
tick_broadcast_on_off(unsigned long reason, int *oncpu) { } -static inline void tick_shutdown_broadcast(unsigned int *cpup) { } -static inline void tick_suspend_broadcast(void) { } -static inline int tick_resume_broadcast(void) { return 0; } -static inline void tick_broadcast_init(void) { } -static inline int tick_broadcast_update_freq(struct clock_event_device *dev, - u32 freq) { return -ENODEV; } - -/* - * Set the periodic handler in non broadcast mode - */ -static inline void tick_set_periodic_handler(struct clock_event_device *dev, - int broadcast) -{ - dev->event_handler = tick_handle_periodic; -} -#endif /* !BROADCAST */ - -/* - * Check, if the device is functional or a dummy for broadcast - */ -static inline int tick_device_is_functional(struct clock_event_device *dev) -{ - return !(dev->features & CLOCK_EVT_FEAT_DUMMY); -} - -int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); - -#endif - -extern void do_timer(unsigned long ticks); -extern void update_wall_time(void); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 7ce740e78e1b..67a64b1670bf 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -38,7 +38,7 @@ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } @@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, ktime_t next_event) { newdev->event_handler = handler; - clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } @@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + 
clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1363d58f07e9..914259128145 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -34,7 +34,7 @@ /* * Per cpu nohz control structure */ -DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* * The time, when the last jiffy update happened. Protected by jiffies_lock. @@ -326,13 +326,6 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, return NOTIFY_OK; } -/* - * Worst case string length in chunks of CPU range seems 2 steps - * separations: 0,2,4,6,... - * This is NR_CPUS + sizeof('\0') - */ -static char __initdata nohz_full_buf[NR_CPUS + 1]; - static int tick_nohz_init_all(void) { int err = -1; @@ -393,8 +386,8 @@ void __init tick_nohz_init(void) context_tracking_cpu_set(cpu); cpu_notifier(tick_nohz_cpu_down_callback, 0); - cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); - pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); + pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", + cpumask_pr_args(tick_nohz_full_mask)); } #endif @@ -423,6 +416,11 @@ static int __init setup_tick_nohz(char *str) __setup("nohz=", setup_tick_nohz); +int tick_nohz_tick_stopped(void) +{ + return __this_cpu_read(tick_cpu_sched.tick_stopped); +} + /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000000..28b5da3e1a17 --- /dev/null +++ b/kernel/time/tick-sched.h @@ -0,0 +1,74 @@ +#ifndef _TICK_SCHED_H +#define _TICK_SCHED_H + +#include <linux/hrtimer.h> + +enum tick_device_mode { + TICKDEV_MODE_PERIODIC, + TICKDEV_MODE_ONESHOT, +}; + +struct tick_device { + struct clock_event_device *evtdev; + enum tick_device_mode mode; +}; + +enum tick_nohz_mode { + NOHZ_MODE_INACTIVE, + 
NOHZ_MODE_LOWRES, + NOHZ_MODE_HIGHRES, +}; + +/** + * struct tick_sched - sched tick emulation and no idle tick control/stats + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary + * to resume the tick timer operation in the timeline + * when the CPU returns from nohz sleep. + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_calls: Total number of idle calls + * @idle_sleeps: Number of idle calls, where the sched tick was stopped + * @idle_entrytime: Time when the idle call was entered + * @idle_waketime: Time when the idle was interrupted + * @idle_exittime: Time when the idle state was left + * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding + * @sleep_length: Duration of the current idle sleep + * @do_timer_last: CPU was the last one doing do_timer before going idle + */ +struct tick_sched { + struct hrtimer sched_timer; + unsigned long check_clocks; + enum tick_nohz_mode nohz_mode; + ktime_t last_tick; + int inidle; + int tick_stopped; + unsigned long idle_jiffies; + unsigned long idle_calls; + unsigned long idle_sleeps; + int idle_active; + ktime_t idle_entrytime; + ktime_t idle_waketime; + ktime_t idle_exittime; + ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; + ktime_t sleep_length; + unsigned long last_jiffies; + unsigned long next_jiffies; + ktime_t idle_expires; + int do_timer_last; +}; + +extern struct tick_sched *tick_get_tick_sched(int cpu); + +extern void tick_setup_sched_timer(void); +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +extern void tick_cancel_sched_timer(int cpu); +#else +static inline void tick_cancel_sched_timer(int cpu) { } +#endif + +#endif diff
--git a/kernel/time/time.c b/kernel/time/time.c index 6390517e77d4..2c85b7724af4 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000000..4687b3104bae --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,112 @@ +/* + * linux/kernel/time/timecounter.c + * + * based on code that migrated away from + * linux/kernel/time/clocksource.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/export.h> +#include <linux/timecounter.h> + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. 
+ */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. + */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + cycle_t cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. 
+ */ + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); + } else { + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a931852082f..946acb72179f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -59,17 +59,15 @@ struct tk_fast { }; static struct tk_fast tk_fast_mono ____cacheline_aligned; +static struct tk_fast tk_fast_raw ____cacheline_aligned; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -/* Flag for if there is a persistent clock on this platform */ -bool __read_mostly persistent_clock_exist = false; - static inline void tk_normalize_xtime(struct timekeeper *tk) { - while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { - tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; + while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { + tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } } @@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) struct timespec64 ts; ts.tv_sec = tk->xtime_sec; - ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); + ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; - tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; + tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; - tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; + tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); } @@ -118,6 +116,117 @@ static 
inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->offs_boot = ktime_add(tk->offs_boot, delta); } +#ifdef CONFIG_DEBUG_TIMEKEEPING +#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ +/* + * These simple flag variables are managed + * without locks, which is racy, but ok since + * we don't really care about being super + * precise about how many events were seen, + * just that a problem was observed. + */ +static int timekeeping_underflow_seen; +static int timekeeping_overflow_seen; + +/* last_warning is only modified under the timekeeping lock */ +static long timekeeping_last_warning; + +static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ + + cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; + const char *name = tk->tkr_mono.clock->name; + + if (offset > max_cycles) { + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", + offset, name, max_cycles); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); + } else { + if (offset > (max_cycles >> 1)) { + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + offset, name, max_cycles >> 1); + printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); + } + } + + if (timekeeping_underflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_underflow_seen = 0; + } + + if (timekeeping_overflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: 
Overflow in clocksource '%s' observed, time update capped.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_overflow_seen = 0; + } +} + +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t now, last, mask, max, delta; + unsigned int seq; + + /* + * Since we're called holding a seqlock, the data may shift + * under us while we're doing the calculation. This can cause + * false positives, since we'd note a problem but throw the + * results away. So nest another seqlock here to atomically + * grab the points we are checking with. + */ + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tkr->read(tkr->clock); + last = tkr->cycle_last; + mask = tkr->mask; + max = tkr->clock->max_cycles; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + delta = clocksource_delta(now, last, mask); + + /* + * Try to catch underflows by checking if we are seeing small + * mask-relative negative values. + */ + if (unlikely((~delta & mask) < (mask >> 3))) { + timekeeping_underflow_seen = 1; + delta = 0; + } + + /* Cap delta value to the max_cycles values to avoid mult overflows */ + if (unlikely(delta > max)) { + timekeeping_overflow_seen = 1; + delta = tkr->clock->max_cycles; + } + + return delta; +} +#else +static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ +} +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t cycle_now, delta; + + /* read clocksource */ + cycle_now = tkr->read(tkr->clock); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + return delta; +} +#endif + /** * tk_setup_internals - Set up internals to use clocksource clock. 
* @@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = tk->tkr.clock; - tk->tkr.clock = clock; - tk->tkr.read = clock->read; - tk->tkr.mask = clock->mask; - tk->tkr.cycle_last = tk->tkr.read(clock); + old_clock = tk->tkr_mono.clock; + tk->tkr_mono.clock = clock; + tk->tkr_mono.read = clock->read; + tk->tkr_mono.mask = clock->mask; + tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + + tk->tkr_raw.clock = clock; + tk->tkr_raw.read = clock->read; + tk->tkr_raw.mask = clock->mask; + tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - tk->tkr.xtime_nsec >>= -shift_change; + tk->tkr_mono.xtime_nsec >>= -shift_change; else - tk->tkr.xtime_nsec <<= shift_change; + tk->tkr_mono.xtime_nsec <<= shift_change; } - tk->tkr.shift = clock->shift; + tk->tkr_raw.xtime_nsec = 0; + + tk->tkr_mono.shift = clock->shift; + tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; @@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. 
*/ - tk->tkr.mult = clock->mult; + tk->tkr_mono.mult = clock->mult; + tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; } @@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; } static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { - cycle_t cycle_now, delta; + cycle_t delta; s64 nsec; - /* read clocksource: */ - cycle_now = tkr->read(tkr->clock); - - /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + delta = timekeeping_get_delta(tkr); nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; @@ -209,30 +323,9 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) -{ - struct clocksource *clock = tk->tkr.clock; - cycle_t cycle_now, delta; - s64 nsec; - - /* read clocksource: */ - cycle_now = tk->tkr.read(clock); - - /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); - - /* convert delta to nanoseconds. */ - nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); - - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); -} - /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. - * @tk: The timekeeper from which we take the update - * @tkf: The fast timekeeper to update - * @tbase: The time base for the fast timekeeper (mono/raw) + * @tkr: Timekeeping readout base from which we take the update * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. 
@@ -244,11 +337,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) * smp_wmb(); <- Ensure that the last base[1] update is visible * tkf->seq++; * smp_wmb(); <- Ensure that the seqcount update is visible - * update(tkf->base[0], tk); + * update(tkf->base[0], tkr); * smp_wmb(); <- Ensure that the base[0] update is visible * tkf->seq++; * smp_wmb(); <- Ensure that the seqcount update is visible - * update(tkf->base[1], tk); + * update(tkf->base[1], tkr); * * The reader side does: * @@ -269,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. */ -static void update_fast_timekeeper(struct timekeeper *tk) +static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) { - struct tk_read_base *base = tk_fast_mono.base; + struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ - raw_write_seqcount_latch(&tk_fast_mono.seq); + raw_write_seqcount_latch(&tkf->seq); /* Update base[0] */ - memcpy(base, &tk->tkr, sizeof(*base)); + memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ - raw_write_seqcount_latch(&tk_fast_mono.seq); + raw_write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); @@ -318,22 +411,67 @@ static void update_fast_timekeeper(struct timekeeper *tk) * of the following timestamps. Callers need to be aware of that and * deal with it. 
*/ -u64 notrace ktime_get_mono_fast_ns(void) +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { - seq = raw_read_seqcount(&tk_fast_mono.seq); - tkr = tk_fast_mono.base + (seq & 0x01); - now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); + seq = raw_read_seqcount(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + } while (read_seqcount_retry(&tkf->seq, seq)); - } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); return now; } + +u64 ktime_get_mono_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_mono); +} EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); +u64 ktime_get_raw_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); + +/* Suspend-time cycles value for halted fast timekeeper. */ +static cycle_t cycles_at_suspend; + +static cycle_t dummy_clock_read(struct clocksource *cs) +{ + return cycles_at_suspend; +} + +/** + * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. + * @tk: Timekeeper to snapshot. + * + * It generally is unsafe to access the clocksource after timekeeping has been + * suspended, so take a snapshot of the readout base of @tk and use it as the + * fast timekeeper's readout base while suspended. It will return the same + * number of cycles every time until timekeeping is resumed at which time the + * proper readout base for the fast timekeeper will be restored automatically. 
+ */ +static void halt_fast_timekeeper(struct timekeeper *tk) +{ + static struct tk_read_base tkr_dummy; + struct tk_read_base *tkr = &tk->tkr_mono; + + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + cycles_at_suspend = tkr->read(tkr->clock); + tkr_dummy.read = dummy_clock_read; + update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); + + tkr = &tk->tkr_raw; + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + tkr_dummy.read = dummy_clock_read; + update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); +} + #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD static inline void update_vsyscall(struct timekeeper *tk) @@ -342,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk) xt = timespec64_to_timespec(tk_xtime(tk)); wm = timespec64_to_timespec(tk->wall_to_monotonic); - update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, - tk->tkr.cycle_last); + update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, + tk->tkr_mono.cycle_last); } static inline void old_vsyscall_fixup(struct timekeeper *tk) @@ -360,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD * users are removed, this can be killed. 
*/ - remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); - tk->tkr.xtime_nsec -= remainder; - tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; + remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) @@ -429,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; - tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* Update the monotonic raw base */ - tk->base_raw = timespec64_to_ktime(tk->raw_time); + tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. 
*/ - nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); + nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; @@ -462,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); - update_fast_timekeeper(tk); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } /** @@ -474,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr.clock; + struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; s64 nsec; - cycle_now = tk->tkr.read(clock); - delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); - tk->tkr.cycle_last = cycle_now; + cycle_now = tk->tkr_mono.read(clock); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; - tk->tkr.xtime_nsec += delta * tk->tkr.mult; + tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; /* If arch requires, add in get_arch_timeoffset() */ - tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; + tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); timespec64_add_ns(&tk->raw_time, nsec); } @@ -510,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts) seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsecs = timekeeping_get_ns(&tk->tkr); + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -550,8 +690,8 @@ ktime_t ktime_get(void) do { seq = read_seqcount_begin(&tk_core.seq); 
- base = tk->tkr.base_mono; - nsecs = timekeeping_get_ns(&tk->tkr); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -576,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); - base = ktime_add(tk->tkr.base_mono, *offset); - nsecs = timekeeping_get_ns(&tk->tkr); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -618,8 +758,8 @@ ktime_t ktime_get_raw(void) do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->base_raw; - nsecs = timekeeping_get_ns_raw(tk); + base = tk->tkr_raw.base; + nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -647,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(&tk->tkr); + nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -732,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(tk); - nsecs_real = timekeeping_get_ns(&tk->tkr); + nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); + nsecs_real = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -916,7 +1056,7 @@ static int change_clocksource(void *data) */ if (try_module_get(new->owner)) { if (!new->enable || new->enable(new) == 0) { - old = tk->tkr.clock; + old = tk->tkr_mono.clock; tk_setup_internals(tk, new); if (old->disable) old->disable(old); @@ -944,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; - if (tk->tkr.clock == clock) + if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, 
clock, NULL); tick_clock_notify(); - return tk->tkr.clock == clock ? 0 : -1; + return tk->tkr_mono.clock == clock ? 0 : -1; } /** @@ -966,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - nsecs = timekeeping_get_ns_raw(tk); + nsecs = timekeeping_get_ns(&tk->tkr_raw); ts64 = tk->raw_time; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -989,7 +1129,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1008,7 +1148,7 @@ u64 timekeeping_max_deferment(void) do { seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tkr.clock->max_idle_ns; + ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1030,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } +void __weak read_persistent_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_persistent_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + /** * read_boot_clock - Return time of the system start. 
* @@ -1045,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts) ts->tv_nsec = 0; } +void __weak read_boot_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_boot_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + +/* Flag for if timekeeping_resume() has injected sleeptime */ +static bool sleeptime_injected; + +/* Flag for if there is a persistent clock on this platform */ +static bool persistent_clock_exists; + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ @@ -1054,20 +1216,17 @@ void __init timekeeping_init(void) struct clocksource *clock; unsigned long flags; struct timespec64 now, boot, tmp; - struct timespec ts; - read_persistent_clock(&ts); - now = timespec_to_timespec64(ts); + read_persistent_clock64(&now); if (!timespec64_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; now.tv_nsec = 0; } else if (now.tv_sec || now.tv_nsec) - persistent_clock_exist = true; + persistent_clock_exists = true; - read_boot_clock(&ts); - boot = timespec_to_timespec64(ts); + read_boot_clock64(&boot); if (!timespec64_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); @@ -1087,7 +1246,6 @@ void __init timekeeping_init(void) tk_set_xtime(tk, &now); tk->raw_time.tv_sec = 0; tk->raw_time.tv_nsec = 0; - tk->base_raw.tv64 = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); @@ -1100,7 +1258,7 @@ void __init timekeeping_init(void) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } -/* time in seconds when suspend began */ +/* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** @@ -1125,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_debug_account_sleep_time(delta); } +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) +/** + * We have 
three kinds of time sources to use for sleep time + * injection, the preference order is: + * 1) non-stop clocksource + * 2) persistent clock (ie: RTC accessible when irqs are off) + * 3) RTC + * + * 1) and 2) are used by timekeeping, 3) by RTC subsystem. + * If system has neither 1) nor 2), 3) will be used finally. + * + * + * If timekeeping has injected sleeptime via either 1) or 2), + * 3) becomes needless, so in this case we don't need to call + * rtc_resume(), and this is what timekeeping_rtc_skipresume() + * means. + */ +bool timekeeping_rtc_skipresume(void) +{ + return sleeptime_injected; +} + +/** + * 1) can be determined whether to use or not only when doing + * timekeeping_resume() which is invoked after rtc_suspend(), + * so we can't skip rtc_suspend() surely if system has 1). + * + * But if system has 2), 2) will definitely be used, so in this + * case we don't need to call rtc_suspend(), and this is what + * timekeeping_rtc_skipsuspend() means. + */ +bool timekeeping_rtc_skipsuspend(void) +{ + return persistent_clock_exists; +} + /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value * - * This hook is for architectures that cannot support read_persistent_clock + * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. + * and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. 
@@ -1140,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - /* - * Make sure we don't set the clock twice, as timekeeping_resume() - * already did it - */ - if (has_persistent_clock()) - return; - raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); @@ -1162,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) /* signal hrtimers about time change */ clock_was_set(); } +#endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. */ -static void timekeeping_resume(void) +void timekeeping_resume(void) { struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *clock = tk->tkr.clock; + struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; - struct timespec tmp; cycle_t cycle_now, cycle_delta; - bool suspendtime_found = false; - read_persistent_clock(&tmp); - ts_new = timespec_to_timespec64(tmp); + sleeptime_injected = false; + read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); @@ -1201,16 +1384,16 @@ static void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. 
*/ - cycle_now = tk->tkr.read(clock); + cycle_now = tk->tkr_mono.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > tk->tkr.cycle_last) { + cycle_now > tk->tkr_mono.cycle_last) { u64 num, max = ULLONG_MAX; u32 mult = clock->mult; u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, - tk->tkr.mask); + cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -1226,17 +1409,19 @@ static void timekeeping_resume(void) nsec += ((u64) cycle_delta * mult) >> shift; ts_delta = ns_to_timespec64(nsec); - suspendtime_found = true; + sleeptime_injected = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); - suspendtime_found = true; + sleeptime_injected = true; } - if (suspendtime_found) + if (sleeptime_injected) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - tk->tkr.cycle_last = cycle_now; + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1245,22 +1430,18 @@ static void timekeeping_resume(void) touch_softlockup_watchdog(); - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ + tick_resume(); hrtimers_resume(); } -static int timekeeping_suspend(void) +int timekeeping_suspend(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; - struct timespec tmp; - read_persistent_clock(&tmp); - timekeeping_suspend_time = timespec_to_timespec64(tmp); + read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at @@ -1268,38 +1449,41 @@ static int timekeeping_suspend(void) * value returned, 
update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) - persistent_clock_exist = true; + persistent_clock_exists = true; raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; - /* - * To avoid drift caused by repeated suspend/resumes, - * which each can add ~1 second drift error, - * try to compensate so the difference in system time - * and persistent_clock time stays close to constant. - */ - delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); - delta_delta = timespec64_sub(delta, old_delta); - if (abs(delta_delta.tv_sec) >= 2) { + if (persistent_clock_exists) { /* - * if delta_delta is too large, assume time correction - * has occured and set old_delta to the current delta. + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant. */ - old_delta = delta; - } else { - /* Otherwise try to adjust old_system to compensate */ - timekeeping_suspend_time = - timespec64_add(timekeeping_suspend_time, delta_delta); + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occurred and set old_delta to the current delta. 
+ */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec64_add(timekeeping_suspend_time, delta_delta); + } } timekeeping_update(tk, TK_MIRROR); + halt_fast_timekeeper(tk); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + tick_suspend(); clocksource_suspend(); clockevents_suspend(); @@ -1388,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * * XXX - TODO: Doc ntp_error calculation. */ - if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { + if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } - tk->tkr.mult += mult_adj; + tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; - tk->tkr.xtime_nsec -= offset; + tk->tkr_mono.xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; } @@ -1458,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) tk->ntp_err_mult = 0; } - if (unlikely(tk->tkr.clock->maxadj && - (abs(tk->tkr.mult - tk->tkr.clock->mult) - > tk->tkr.clock->maxadj))) { + if (unlikely(tk->tkr_mono.clock->maxadj && + (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) + > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", - tk->tkr.clock->name, (long)tk->tkr.mult, - (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); + tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, + (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* @@ -1481,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * We'll correct this error next time through this function, when * xtime_nsec is not as small. 
*/ - if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { - s64 neg = -(s64)tk->tkr.xtime_nsec; - tk->tkr.xtime_nsec = 0; + if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { + s64 neg = -(s64)tk->tkr_mono.xtime_nsec; + tk->tkr_mono.xtime_nsec = 0; tk->ntp_error += neg << tk->ntp_error_shift; } } @@ -1498,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { - u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; - while (tk->tkr.xtime_nsec >= nsecps) { + while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; - tk->tkr.xtime_nsec -= nsecps; + tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* Figure out if its a leap sec and apply if needed */ @@ -1549,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->tkr.cycle_last += interval; + tk->tkr_mono.cycle_last += interval; + tk->tkr_raw.cycle_last += interval; - tk->tkr.xtime_nsec += tk->xtime_interval << shift; + tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ @@ -1594,14 +1779,17 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), - tk->tkr.cycle_last, tk->tkr.mask); + offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval) goto out; + /* Do some additional sanity checking */ + timekeeping_check_update(real_tk, offset); + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. 
To do this efficiently, @@ -1659,24 +1847,24 @@ out: } /** - * getboottime - Return the real time of system boot. - * @ts: pointer to the timespec to be set + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set * - * Returns the wall-time of boot in a timespec. + * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ -void getboottime(struct timespec *ts) +void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); - *ts = ktime_to_timespec(t); + *ts = ktime_to_timespec64(t); } -EXPORT_SYMBOL_GPL(getboottime); +EXPORT_SYMBOL_GPL(getboottime64); unsigned long get_seconds(void) { @@ -1756,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->tkr.base_mono; - nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; + base = tk->tkr_mono.base; + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; @@ -1788,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->tkr.base_mono; - nsecs = timekeeping_get_ns(&tk->tkr); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index adc1fc98bde3..ead8794b9a4e 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -16,5 +16,14 @@ extern int timekeeping_inject_offset(struct timespec *ts); extern s32 timekeeping_get_tai_offset(void); extern void timekeeping_set_tai_offset(s32 
tai_offset); extern void timekeeping_clocktai(struct timespec *ts); +extern int timekeeping_suspend(void); +extern void timekeeping_resume(void); + +extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); + +extern seqlock_t jiffies_lock; + +#define CS_NAME_LEN 32 #endif diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c504939..2ece3aa5069c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -90,8 +90,18 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; +/* + * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've + * made NULL special, hint: lock_timer_base()) and we cannot get a compile time + * pointer to per-cpu entries because we don't know where we'll map the section, + * even for the boot cpu. + * + * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the + * rest of them. + */ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); + static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ @@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); + /** * del_timer_sync - deactivate a timer and wait for the handler to finish. 
* @timer: the timer to be deactivated @@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -static int init_timers_cpu(int cpu) -{ - int j; - struct tvec_base *base; - static char tvec_base_done[NR_CPUS]; - - if (!tvec_base_done[cpu]) { - static char boot_done; - - if (boot_done) { - /* - * The APs use this path later in boot - */ - base = kzalloc_node(sizeof(*base), GFP_KERNEL, - cpu_to_node(cpu)); - if (!base) - return -ENOMEM; - - /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ - if (WARN_ON(base != tbase_get_base(base))) { - kfree(base); - return -ENOMEM; - } - per_cpu(tvec_bases, cpu) = base; - } else { - /* - * This is for the boot CPU - we use compile-time - * static initialisation because per-cpu memory isn't - * ready yet and because the memory allocators are not - * initialised either. - */ - boot_done = 1; - base = &boot_tvec_bases; - } - spin_lock_init(&base->lock); - tvec_base_done[cpu] = 1; - base->cpu = cpu; - } else { - base = per_cpu(tvec_bases, cpu); - } - - - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; - base->active_timers = 0; - base->all_timers = 0; - return 0; -} - #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) { @@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu) migrate_timer_list(new_base, old_base->tv5.vec + i); } + old_base->active_timers = 0; + old_base->all_timers = 0; + spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } -#endif /* CONFIG_HOTPLUG_CPU */ static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { 
- long cpu = (long)hcpu; - int err; - - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = init_timers_cpu(cpu); - if (err < 0) - return notifier_from_errno(err); - break; -#ifdef CONFIG_HOTPLUG_CPU + switch (action) { case CPU_DEAD: case CPU_DEAD_FROZEN: - migrate_timers(cpu); + migrate_timers((long)hcpu); break; -#endif default: break; } + return NOTIFY_OK; } -static struct notifier_block timers_nb = { - .notifier_call = timer_cpu_notify, -}; +static inline void timer_register_cpu_notifier(void) +{ + cpu_notifier(timer_cpu_notify, 0); +} +#else +static inline void timer_register_cpu_notifier(void) { } +#endif /* CONFIG_HOTPLUG_CPU */ +static void __init init_timer_cpu(struct tvec_base *base, int cpu) +{ + int j; -void __init init_timers(void) + BUG_ON(base != tbase_get_base(base)); + + base->cpu = cpu; + per_cpu(tvec_bases, cpu) = base; + spin_lock_init(&base->lock); + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; +} + +static void __init init_timer_cpus(void) { - int err; + struct tvec_base *base; + int local_cpu = smp_processor_id(); + int cpu; + for_each_possible_cpu(cpu) { + if (cpu == local_cpu) + base = &boot_tvec_bases; +#ifdef CONFIG_SMP + else + base = per_cpu_ptr(&__tvec_bases, cpu); +#endif + + init_timer_cpu(base, cpu); + } +} + +void __init init_timers(void) +{ /* ensure there are enough low bits for flags in timer->base pointer */ BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); - err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - BUG_ON(err != NOTIFY_OK); - + init_timer_cpus(); init_timer_stats(); - register_cpu_notifier(&timers_nb); + timer_register_cpu_notifier(); 
open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..e878c2e0ba45 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -16,10 +16,10 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> -#include <linux/tick.h> #include <asm/uaccess.h> +#include "tick-internal.h" struct timer_list_iter { int cpu; @@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); - SEQ_printf(m, "\n"); + if (dev->set_mode) { + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + } else { + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_state_shutdown); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } + + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); + } + } SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 979ccde26720..98f26588255e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,11 +3,11 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation -CFLAGS_trace_selftest_dynamic.o = -pg +CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) obj-y += trace_selftest_dynamic.o endif endif diff --git 
a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fcc0e7052a79..5a2e0b53af30 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1059,6 +1059,12 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_graph_active; +#else +# define ftrace_graph_active 0 +#endif + #ifdef CONFIG_DYNAMIC_FTRACE static struct ftrace_ops *removed_ops; @@ -2041,8 +2047,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (!ftrace_rec_count(rec)) rec->flags = 0; else - /* Just disable the record (keep REGS state) */ - rec->flags &= ~FTRACE_FL_ENABLED; + /* + * Just disable the record, but keep the ops TRAMP + * and REGS states. The _EN flags must be disabled though. + */ + rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | + FTRACE_FL_REGS_EN); } return FTRACE_UPDATE_MAKE_NOP; @@ -2688,24 +2698,36 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) static void ftrace_startup_sysctl(void) { + int command; + if (unlikely(ftrace_disabled)) return; /* Force update next time */ saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } } static void ftrace_shutdown_sysctl(void) { + int command; + if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_DISABLE_CALLS); + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } } static cycle_t ftrace_update_time; @@ -5558,12 +5580,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if 
(ftrace_enabled) { - ftrace_startup_sysctl(); - /* we are starting ftrace again */ if (ftrace_ops_list != &ftrace_list_end) update_ftrace_function(); + ftrace_startup_sysctl(); + } else { /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; @@ -5590,8 +5612,6 @@ static struct ftrace_ops graph_ops = { ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; -static int ftrace_graph_active; - int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 1c71382b283d..eb4220a132ec 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,5 +13,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 96079180de3d..5040d44fe5a3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -445,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s) struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; + wait_queue_head_t full_waiters; bool waiters_pending; + bool full_waiters_pending; + bool wakeup_full; }; /* @@ -527,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work) struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); + if (rbwork->wakeup_full) { + rbwork->wakeup_full = false; + wake_up_all(&rbwork->full_waiters); + } } /** @@ -551,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * data in any cpu buffer, or a specific buffer, put the * caller on the appropriate wait queue. 
*/ - if (cpu == RING_BUFFER_ALL_CPUS) + if (cpu == RING_BUFFER_ALL_CPUS) { work = &buffer->irq_work; - else { + /* Full only makes sense on per cpu reads */ + full = false; + } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -ENODEV; cpu_buffer = buffer->buffers[cpu]; @@ -562,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) while (true) { - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + if (full) + prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); + else + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); /* * The events can happen in critical sections where @@ -584,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * that is necessary is that the wake up happens after * a task has been queued. It's OK for spurious wake ups. */ - work->waiters_pending = true; + if (full) + work->full_waiters_pending = true; + else + work->waiters_pending = true; if (signal_pending(current)) { ret = -EINTR; @@ -613,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) schedule(); } - finish_wait(&work->waiters, &wait); + if (full) + finish_wait(&work->full_waiters, &wait); + else + finish_wait(&work->waiters, &wait); return ret; } @@ -1228,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); + init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -2799,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static __always_inline void rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { + bool pagebusy; + if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() 
supplies it's own memory barriers */ @@ -2810,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) /* irq_work_queue() supplies it's own memory barriers */ irq_work_queue(&cpu_buffer->irq_work.work); } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c8913bac204..bcfa2add6dda 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3355,12 +3355,12 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); - if (count - len < 2) { + len = snprintf(mask_str, count, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)); + if (len >= count) { count = -EINVAL; goto out_err; } - len += sprintf(mask_str + len, "\n"); count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); out_err: @@ -4941,7 +4941,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, *fpos += written; out_unlock: - for (i = 0; i < nr_pages; i++){ + for (i = nr_pages - 1; i >= 0; i--) { kunmap_atomic(map_page[i]); put_page(pages[i]); } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) } void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) + struct pt_regs **regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, if 
(*rctxp < 0) return NULL; + if (regs) + *regs = this_cpu_ptr(&__perf_regs[*rctxp]); raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c1c6655847c8..ed998fbf09ce 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; @@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index f8b45d8792f9..e694c9f9efa4 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -120,7 +120,7 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, __trace_seq_init(s); - seq_buf_bitmask(&s->seq, maskp, nmaskbits); + seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp); if (unlikely(seq_buf_has_overflowed(&s->seq))) { s->seq.len = save_len; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c6ee36fcbf90..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size -= sizeof(u32); rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, regs, &rctx); + sys_data->enter_event->event.type, NULL, &rctx); if (!rec) return; @@ -647,7 +647,7 @@ static void 
perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size -= sizeof(u32); rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, regs, &rctx); + sys_data->exit_event->event.type, NULL, &rctx); if (!rec) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 5f0eba9e5e6b..7dc1c8abecd6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) goto out; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 70bf11815f84..3174bf8e3538 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -154,7 +154,7 @@ static int get_softlockup_thresh(void) */ static unsigned long get_timestamp(void) { - return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ + return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6202b08f1933..586ad91300b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -159,6 +159,7 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ + struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ @@ -230,7 +231,7 @@ struct wq_device; */ struct workqueue_struct { struct list_head pwqs; /* WR: all pwqs of this wq */ - struct list_head list; /* PL: list of all workqueues */ + struct list_head list; /* PR: list of all workqueues */ struct mutex mutex; /* protects this wq */ int work_color; /* WQ: current work color */ @@ -257,6 +258,13 @@ struct 
workqueue_struct { #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ + /* + * Destruction of workqueue_struct is sched-RCU protected to allow + * walking the workqueues list without grabbing wq_pool_mutex. + * This is used to dump all workqueues from sysrq. + */ + struct rcu_head rcu; + /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ @@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static LIST_HEAD(workqueues); /* PL: list of all workqueues */ +static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* the per-cpu worker pools */ @@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from); +static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -1841,17 +1850,11 @@ static void pool_mayday_timeout(unsigned long __pool) * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. 
*/ -static bool maybe_create_worker(struct worker_pool *pool) +static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { - if (!need_to_create_worker(pool)) - return false; restart: spin_unlock_irq(&pool->lock); @@ -1877,7 +1880,6 @@ restart: */ if (need_to_create_worker(pool)) goto restart; - return true; } /** @@ -1897,16 +1899,14 @@ restart: * multiple times. Does GFP_KERNEL allocations. * * Return: - * %false if the pool don't need management and the caller can safely start - * processing works, %true indicates that the function released pool->lock - * and reacquired it to perform some management function and that the - * conditions that the caller verified while holding the lock before - * calling the function might no longer be true. + * %false if the pool doesn't need management and the caller can safely + * start processing works, %true if management function was performed and + * the conditions that the caller verified before calling the function may + * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - bool ret = false; /* * Anyone who successfully grabs manager_arb wins the arbitration @@ -1919,12 +1919,14 @@ static bool manage_workers(struct worker *worker) * actual management, the pool may stall indefinitely. 
*/ if (!mutex_trylock(&pool->manager_arb)) - return ret; + return false; + pool->manager = worker; - ret |= maybe_create_worker(pool); + maybe_create_worker(pool); + pool->manager = NULL; mutex_unlock(&pool->manager_arb); - return ret; + return true; } /** @@ -2312,6 +2314,7 @@ repeat: struct wq_barrier { struct work_struct work; struct completion done; + struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) @@ -2360,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + barr->task = current; /* * If @target is currently being executed, schedule the @@ -2737,19 +2741,57 @@ bool flush_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work); +struct cwt_wait { + wait_queue_t wait; + struct work_struct *work; +}; + +static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); + + if (cwait->work != key) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); unsigned long flags; int ret; do { ret = try_to_grab_pending(work, is_dwork, &flags); /* - * If someone else is canceling, wait for the same event it - * would be waiting for before retrying. + * If someone else is already canceling, wait for it to + * finish. 
flush_work() doesn't work for PREEMPT_NONE + * because we may get scheduled between @work's completion + * and the other canceling task resuming and clearing + * CANCELING - flush_work() will return false immediately + * as @work is no longer busy, try_to_grab_pending() will + * return -ENOENT as @work is still being canceled and the + * other canceling task won't be able to clear CANCELING as + * we're hogging the CPU. + * + * Let's wait for completion using a waitqueue. As this + * may lead to the thundering herd problem, use a custom + * wake function which matches @work along with exclusive + * wait and wakeup. */ - if (unlikely(ret == -ENOENT)) - flush_work(work); + if (unlikely(ret == -ENOENT)) { + struct cwt_wait cwait; + + init_wait(&cwait.wait); + cwait.wait.func = cwt_wakefn; + cwait.work = work; + + prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, + TASK_UNINTERRUPTIBLE); + if (work_is_canceling(work)) + schedule(); + finish_wait(&cancel_waitq, &cwait.wait); + } } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ @@ -2758,6 +2800,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) flush_work(work); clear_work_data(work); + + /* + * Paired with prepare_to_wait() above so that either + * waitqueue_active() is visible here or !work_is_canceling() is + * visible there. + */ + smp_mb(); + if (waitqueue_active(&cancel_waitq)) + __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); + return ret; } @@ -2950,324 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -#ifdef CONFIG_SYSFS -/* - * Workqueues with WQ_SYSFS flag set is visible to userland via - * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the - * following attributes. 
- * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items - * - * Unbound workqueues have the following extra attributes. - * - * id RO int : the associated pool ID - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers - */ -struct wq_device { - struct workqueue_struct *wq; - struct device dev; -}; - -static struct workqueue_struct *dev_to_wq(struct device *dev) -{ - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - - return wq_dev->wq; -} - -static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - - return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); -} -static DEVICE_ATTR_RO(per_cpu); - -static ssize_t max_active_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - - return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); -} - -static ssize_t max_active_store(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int val; - - if (sscanf(buf, "%d", &val) != 1 || val <= 0) - return -EINVAL; - - workqueue_set_max_active(wq, val); - return count; -} -static DEVICE_ATTR_RW(max_active); - -static struct attribute *wq_sysfs_attrs[] = { - &dev_attr_per_cpu.attr, - &dev_attr_max_active.attr, - NULL, -}; -ATTRIBUTE_GROUPS(wq_sysfs); - -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - rcu_read_lock_sched(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + 
written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); - - return written; -} - -static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); - mutex_unlock(&wq->mutex); - - return written; -} - -/* prepare workqueue_attrs for sysfs store operations */ -static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) -{ - struct workqueue_attrs *attrs; - - attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!attrs) - return NULL; - - mutex_lock(&wq->mutex); - copy_workqueue_attrs(attrs, wq->unbound_attrs); - mutex_unlock(&wq->mutex); - return attrs; -} - -static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; - - if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) - ret = apply_workqueue_attrs(wq, attrs); - else - ret = -EINVAL; - - free_workqueue_attrs(attrs); - return ret ?: count; -} - -static ssize_t wq_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); - mutex_unlock(&wq->mutex); - - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - return written; -} - -static ssize_t wq_cpumask_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; - - ret = cpumask_parse(buf, attrs->cpumask); 
- if (!ret) - ret = apply_workqueue_attrs(wq, attrs); - - free_workqueue_attrs(attrs); - return ret ?: count; -} - -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); - mutex_unlock(&wq->mutex); - - return written; -} - -static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int v, ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; - - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; - ret = apply_workqueue_attrs(wq, attrs); - } - - free_workqueue_attrs(attrs); - return ret ?: count; -} - -static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), - __ATTR(nice, 0644, wq_nice_show, wq_nice_store), - __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), - __ATTR_NULL, -}; - -static struct bus_type wq_subsys = { - .name = "workqueue", - .dev_groups = wq_sysfs_groups, -}; - -static int __init wq_sysfs_init(void) -{ - return subsys_virtual_register(&wq_subsys, NULL); -} -core_initcall(wq_sysfs_init); - -static void wq_device_release(struct device *dev) -{ - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - - kfree(wq_dev); -} - -/** - * workqueue_sysfs_register - make a workqueue visible in sysfs - * @wq: the workqueue to register - * - * Expose @wq in sysfs under /sys/bus/workqueue/devices. - * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set - * which is the preferred method. 
- * - * Workqueue user should use this function directly iff it wants to apply - * workqueue_attrs before making the workqueue visible in sysfs; otherwise, - * apply_workqueue_attrs() may race against userland updating the - * attributes. - * - * Return: 0 on success, -errno on failure. - */ -int workqueue_sysfs_register(struct workqueue_struct *wq) -{ - struct wq_device *wq_dev; - int ret; - - /* - * Adjusting max_active or creating new pwqs by applyting - * attributes breaks ordering guarantee. Disallow exposing ordered - * workqueues. - */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) - return -EINVAL; - - wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); - if (!wq_dev) - return -ENOMEM; - - wq_dev->wq = wq; - wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; - wq_dev->dev.release = wq_device_release; - - /* - * unbound_attrs are created separately. Suppress uevent until - * everything is ready. - */ - dev_set_uevent_suppress(&wq_dev->dev, true); - - ret = device_register(&wq_dev->dev); - if (ret) { - kfree(wq_dev); - wq->wq_dev = NULL; - return ret; - } - - if (wq->flags & WQ_UNBOUND) { - struct device_attribute *attr; - - for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { - ret = device_create_file(&wq_dev->dev, attr); - if (ret) { - device_unregister(&wq_dev->dev); - wq->wq_dev = NULL; - return ret; - } - } - } - - dev_set_uevent_suppress(&wq_dev->dev, false); - kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); - return 0; -} - -/** - * workqueue_sysfs_unregister - undo workqueue_sysfs_register() - * @wq: the workqueue to unregister - * - * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 
- */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) -{ - struct wq_device *wq_dev = wq->wq_dev; - - if (!wq->wq_dev) - return; - - wq->wq_dev = NULL; - device_unregister(&wq_dev->dev); -} -#else /* CONFIG_SYSFS */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } -#endif /* CONFIG_SYSFS */ - /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free @@ -3386,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool) return 0; } +static void rcu_free_wq(struct rcu_head *rcu) +{ + struct workqueue_struct *wq = + container_of(rcu, struct workqueue_struct, rcu); + + if (!(wq->flags & WQ_UNBOUND)) + free_percpu(wq->cpu_pwqs); + else + free_workqueue_attrs(wq->unbound_attrs); + + kfree(wq->rescuer); + kfree(wq); +} + static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); @@ -3563,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) /* * If we're the last pwq going away, @wq is already dead and no one - * is gonna access it anymore. Free it. + * is gonna access it anymore. Schedule RCU free. */ - if (is_last) { - free_workqueue_attrs(wq->unbound_attrs); - kfree(wq); - } + if (is_last) + call_rcu_sched(&wq->rcu, rcu_free_wq); } /** @@ -4105,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); - list_add(&wq->list, &workqueues); + list_add_tail_rcu(&wq->list, &workqueues); mutex_unlock(&wq_pool_mutex); @@ -4161,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq) * flushing is complete in case freeze races us. 
*/ mutex_lock(&wq_pool_mutex); - list_del_init(&wq->list); + list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); workqueue_sysfs_unregister(wq); - if (wq->rescuer) { + if (wq->rescuer) kthread_stop(wq->rescuer->task); - kfree(wq->rescuer); - wq->rescuer = NULL; - } if (!(wq->flags & WQ_UNBOUND)) { /* * The base ref is never dropped on per-cpu pwqs. Directly - * free the pwqs and wq. + * schedule RCU free. */ - free_percpu(wq->cpu_pwqs); - kfree(wq); + call_rcu_sched(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. Directly @@ -4399,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) } } +static void pr_cont_pool_info(struct worker_pool *pool) +{ + pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); + if (pool->node != NUMA_NO_NODE) + pr_cont(" node=%d", pool->node); + pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); +} + +static void pr_cont_work(bool comma, struct work_struct *work) +{ + if (work->func == wq_barrier_func) { + struct wq_barrier *barr; + + barr = container_of(work, struct wq_barrier, work); + + pr_cont("%s BAR(%d)", comma ? "," : "", + task_pid_nr(barr->task)); + } else { + pr_cont("%s %pf", comma ? "," : "", work->func); + } +} + +static void show_pwq(struct pool_workqueue *pwq) +{ + struct worker_pool *pool = pwq->pool; + struct work_struct *work; + struct worker *worker; + bool has_in_flight = false, has_pending = false; + int bkt; + + pr_info(" pwq %d:", pool->id); + pr_cont_pool_info(pool); + + pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); + + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (worker->current_pwq == pwq) { + has_in_flight = true; + break; + } + } + if (has_in_flight) { + bool comma = false; + + pr_info(" in-flight:"); + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (worker->current_pwq != pwq) + continue; + + pr_cont("%s %d%s:%pf", comma ? 
"," : "", + task_pid_nr(worker->task), + worker == pwq->wq->rescuer ? "(RESCUER)" : "", + worker->current_func); + list_for_each_entry(work, &worker->scheduled, entry) + pr_cont_work(false, work); + comma = true; + } + pr_cont("\n"); + } + + list_for_each_entry(work, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + has_pending = true; + break; + } + } + if (has_pending) { + bool comma = false; + + pr_info(" pending:"); + list_for_each_entry(work, &pool->worklist, entry) { + if (get_work_pwq(work) != pwq) + continue; + + pr_cont_work(comma, work); + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); + } + pr_cont("\n"); + } + + if (!list_empty(&pwq->delayed_works)) { + bool comma = false; + + pr_info(" delayed:"); + list_for_each_entry(work, &pwq->delayed_works, entry) { + pr_cont_work(comma, work); + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); + } + pr_cont("\n"); + } +} + +/** + * show_workqueue_state - dump workqueue state + * + * Called from a sysrq handler and prints out all busy workqueues and + * pools. 
+ */ +void show_workqueue_state(void) +{ + struct workqueue_struct *wq; + struct worker_pool *pool; + unsigned long flags; + int pi; + + rcu_read_lock_sched(); + + pr_info("Showing busy workqueues and worker pools:\n"); + + list_for_each_entry_rcu(wq, &workqueues, list) { + struct pool_workqueue *pwq; + bool idle = true; + + for_each_pwq(pwq, wq) { + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + idle = false; + break; + } + } + if (idle) + continue; + + pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); + + for_each_pwq(pwq, wq) { + spin_lock_irqsave(&pwq->pool->lock, flags); + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + show_pwq(pwq); + spin_unlock_irqrestore(&pwq->pool->lock, flags); + } + } + + for_each_pool(pool, pi) { + struct worker *worker; + bool first = true; + + spin_lock_irqsave(&pool->lock, flags); + if (pool->nr_workers == pool->nr_idle) + goto next_pool; + + pr_info("pool %d:", pool->id); + pr_cont_pool_info(pool); + pr_cont(" workers=%d", pool->nr_workers); + if (pool->manager) + pr_cont(" manager: %d", + task_pid_nr(pool->manager->task)); + list_for_each_entry(worker, &pool->idle_list, entry) { + pr_cont(" %s%d", first ? "idle: " : "", + task_pid_nr(worker->task)); + first = false; + } + pr_cont("\n"); + next_pool: + spin_unlock_irqrestore(&pool->lock, flags); + } + + rcu_read_unlock_sched(); +} + /* * CPU hotplug. * @@ -4796,6 +4698,323 @@ out_unlock: } #endif /* CONFIG_FREEZER */ +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set are visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes.
+ * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} +static DEVICE_ATTR_RO(per_cpu); + +static ssize_t max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t max_active_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} +static DEVICE_ATTR_RW(max_active); + +static struct attribute *wq_sysfs_attrs[] = { + &dev_attr_per_cpu.attr, + &dev_attr_max_active.attr, + NULL, +}; +ATTRIBUTE_GROUPS(wq_sysfs); + +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + const char *delim = ""; + int node, written = 0; + + rcu_read_lock_sched(); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct 
workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", + cpumask_pr_args(wq->unbound_attrs->cpumask)); + mutex_unlock(&wq->mutex); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq 
= dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR_NULL, +}; + +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_groups = wq_sysfs_groups, +}; + +static int __init wq_sysfs_init(void) +{ + return subsys_virtual_register(&wq_subsys, NULL); +} +core_initcall(wq_sysfs_init); + +static void wq_device_release(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Return: 0 on success, -errno on failure. 
+ */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev; + int ret; + + /* + * Adjusting max_active or creating new pwqs by applying + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; + + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; + + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; + + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); + + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } + + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; + + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; + } + } + } + + dev_set_uevent_suppress(&wq_dev->dev, false); + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. + */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev = wq->wq_dev; + + if (!wq->wq_dev) + return; + + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); +} +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; |