Diffstat (limited to 'kernel'): 59 files changed, 1810 insertions, 1074 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bde4e9c..6e699100872f 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,7 +1,5 @@ # # Generated files # -config_data.h -config_data.gz timeconst.h hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 6aa7543bcdb2..6c57e78817da 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -116,17 +116,8 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n -$(obj)/configs.o: $(obj)/config_data.h +$(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - -filechk_ikconfiggz = \ - echo "static const char kernel_config_data[] __used = MAGIC_START"; \ - cat $< | scripts/bin2c; \ - echo "MAGIC_END;" - -targets += config_data.h -$(obj)/config_data.h: $(obj)/config_data.gz FORCE - $(call filechk,ikconfiggz) diff --git a/kernel/async.c b/kernel/async.c index a893d6170944..f6bd0d9885e1 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -149,7 +149,25 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) +/** + * async_schedule_node_domain - NUMA specific version of async_schedule_domain + * @func: function to execute asynchronously + * @data: data pointer to pass to the function + * @node: NUMA node that we want to schedule this on or close to + * @domain: the domain + * + * Returns an async_cookie_t that may be used for checkpointing later. + * @domain may be used in the async_synchronize_*_domain() functions to + * wait within a certain synchronization domain rather than globally. + * + * Note: This function may be called from atomic or non-atomic contexts. + * + * The node requested will be honored on a best effort basis. If the node + * has no CPUs associated with it then the work is distributed among all + * available CPUs. + */ +async_cookie_t async_schedule_node_domain(async_func_t func, void *data, + int node, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -195,43 +213,30 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy current->flags |= PF_USED_ASYNC; /* schedule for execution */ - queue_work(system_unbound_wq, &entry->work); + queue_work_node(node, system_unbound_wq, &entry->work); return newcookie; } +EXPORT_SYMBOL_GPL(async_schedule_node_domain); /** - * async_schedule - schedule a function for asynchronous execution + * async_schedule_node - NUMA specific version of async_schedule * @func: function to execute asynchronously * @data: data pointer to pass to the function + * @node: NUMA node that we want to schedule this on or close to * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. - */ -async_cookie_t async_schedule(async_func_t func, void *data) -{ - return __async_schedule(func, data, &async_dfl_domain); -} -EXPORT_SYMBOL_GPL(async_schedule); - -/** - * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @func: function to execute asynchronously - * @data: data pointer to pass to the function - * @domain: the domain * - * Returns an async_cookie_t that may be used for checkpointing later. - * @domain may be used in the async_synchronize_*_domain() functions to - * wait within a certain synchronization domain rather than globally. 
A - * synchronization domain is specified via @domain. Note: This function - * may be called from atomic or non-atomic contexts. + * The node requested will be honored on a best effort basis. If the node + * has no CPUs associated with it then the work is distributed among all + * available CPUs. */ -async_cookie_t async_schedule_domain(async_func_t func, void *data, - struct async_domain *domain) +async_cookie_t async_schedule_node(async_func_t func, void *data, int node) { - return __async_schedule(func, data, domain); + return async_schedule_node_domain(func, data, node, &async_dfl_domain); } -EXPORT_SYMBOL_GPL(async_schedule_domain); +EXPORT_SYMBOL_GPL(async_schedule_node); /** * async_synchronize_full - synchronize all asynchronous function calls diff --git a/kernel/audit.c b/kernel/audit.c index 632d36059556..c89ea48c70a6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -396,10 +396,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old, struct audit_buffer *ab; int rc = 0; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%u old=%u ", function_name, new, old); + audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) @@ -1053,7 +1053,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) +static void audit_log_common_recv_msg(struct audit_context *context, + struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); @@ -1063,7 +1064,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return; } - *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); @@ -1071,6 +1072,12 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) audit_log_task_context(*ab); } +static inline void audit_log_user_recv_msg(struct audit_buffer **ab, + u16 msg_type) +{ + audit_log_common_recv_msg(NULL, ab, msg_type); +} + int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); @@ -1338,7 +1345,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) break; } - audit_log_common_recv_msg(&ab, msg_type); + audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, @@ -1361,8 +1368,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); + audit_log_format(ab, " op=%s audit_enabled=%d res=0", + msg_type == AUDIT_ADD_RULE ? 
+ "add_rule" : "remove_rule", + audit_enabled); audit_log_end(ab); return -EPERM; } @@ -1373,7 +1384,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TRIM: audit_trim_trees(); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; @@ -1403,8 +1415,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... */ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); - + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); @@ -1471,7 +1483,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, @@ -2054,153 +2067,6 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) -{ - int i; - - if (cap_isclear(*cap)) { - audit_log_format(ab, " %s=0", prefix); - return; - } - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) - audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); -} - -static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -{ - audit_log_cap(ab, "cap_fp", &name->fcap.permitted); - audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); - audit_log_format(ab, " cap_fe=%d cap_fver=%x", - name->fcap.fE, name->fcap_ver); -} - -static inline int audit_copy_fcaps(struct audit_names *name, - const struct dentry *dentry) -{ - struct cpu_vfs_cap_data caps; - int rc; - - if (!dentry) - return 0; - - rc = get_vfs_caps_from_disk(dentry, &caps); - if (rc) - return rc; - - name->fcap.permitted = caps.permitted; - name->fcap.inheritable = caps.inheritable; - name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> - VFS_CAP_REVISION_SHIFT; - - return 0; -} - -/* Copy inode data into an audit_names. 
*/ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); - audit_copy_fcaps(name, dentry); -} - -/** - * audit_log_name - produce AUDIT_PATH record from struct audit_names - * @context: audit_context for the task - * @n: audit_names structure with reportable details - * @path: optional path to report instead of audit_names->name - * @record_num: record number to report when handling a list of names - * @call_panic: optional pointer to int that will be updated if secid fails - */ -void audit_log_name(struct audit_context *context, struct audit_names *n, - const struct path *path, int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; - - audit_log_format(ab, "item=%d", record_num); - - if (path) - audit_log_d_path(ab, " name=", path); - else if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != AUDIT_INO_UNSET) - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - from_kuid(&init_user_ns, n->uid), - from_kgid(&init_user_ns, n->gid), - MAJOR(n->rdev), - MINOR(n->rdev)); - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - /* log the audit_names record type */ - switch(n->type) { - case AUDIT_TYPE_NORMAL: - audit_log_format(ab, " nametype=NORMAL"); - break; - case AUDIT_TYPE_PARENT: - audit_log_format(ab, " nametype=PARENT"); - break; - case AUDIT_TYPE_CHILD_DELETE: - audit_log_format(ab, " nametype=DELETE"); - break; - case AUDIT_TYPE_CHILD_CREATE: - audit_log_format(ab, " nametype=CREATE"); - break; - default: - audit_log_format(ab, " nametype=UNKNOWN"); - break; - } - - audit_log_fcaps(ab, n); - audit_log_end(ab); -} - int audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; @@ -2322,6 +2188,91 @@ void audit_log_link_denied(const char *operation) audit_log_end(ab); } +/* global counter which is incremented every time something logs in */ +static atomic_t session_id = ATOMIC_INIT(0); + +static int audit_set_loginuid_perm(kuid_t loginuid) +{ + /* if we are unset, we don't need privs */ + if (!audit_loginuid_set(current)) + return 0; + /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ + if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) + return -EPERM; + /* it is set, you need permission */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + /* reject if this is not an unset and we don't allow that */ + if 
(is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) + && uid_valid(loginuid)) + return -EPERM; + return 0; +} + +static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, + unsigned int oldsessionid, + unsigned int sessionid, int rc) +{ + struct audit_buffer *ab; + uid_t uid, oldloginuid, loginuid; + struct tty_struct *tty; + + if (!audit_enabled) + return; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + + uid = from_kuid(&init_user_ns, task_uid(current)); + oldloginuid = from_kuid(&init_user_ns, koldloginuid); + loginuid = from_kuid(&init_user_ns, kloginuid), + tty = audit_get_tty(); + + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); + audit_log_task_context(ab); + audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", + oldsessionid, sessionid, !rc); + audit_put_tty(tty); + audit_log_end(ab); +} + +/** + * audit_set_loginuid - set current task's loginuid + * @loginuid: loginuid value + * + * Returns 0. + * + * Called (set) from fs/proc/base.c::proc_loginuid_write(). + */ +int audit_set_loginuid(kuid_t loginuid) +{ + unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; + kuid_t oldloginuid; + int rc; + + oldloginuid = audit_get_loginuid(current); + oldsessionid = audit_get_sessionid(current); + + rc = audit_set_loginuid_perm(loginuid); + if (rc) + goto out; + + /* are we setting or clearing? */ + if (uid_valid(loginuid)) { + sessionid = (unsigned int)atomic_inc_return(&session_id); + if (unlikely(sessionid == AUDIT_SID_UNSET)) + sessionid = (unsigned int)atomic_inc_return(&session_id); + } + + current->sessionid = sessionid; + current->loginuid = loginuid; +out: + audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); + return rc; +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer diff --git a/kernel/audit.h b/kernel/audit.h index 91421679a168..958d5b8fc1b3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -69,6 +69,7 @@ struct audit_cap_data { kernel_cap_t effective; /* effective set of process */ }; kernel_cap_t ambient; + kuid_t rootid; }; /* When fs/namei.c:getname() is called, we store the pointer in name and bump @@ -212,15 +213,6 @@ extern bool audit_ever_enabled; extern void audit_log_session_info(struct audit_buffer *ab); -extern void audit_copy_inode(struct audit_names *name, - const struct dentry *dentry, - struct inode *inode); -extern void audit_log_cap(struct audit_buffer *ab, char *prefix, - kernel_cap_t *cap); -extern void audit_log_name(struct audit_context *context, - struct audit_names *n, const struct path *path, - int record_num, int *call_panic); - extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 @@ -267,25 +259,52 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab, extern struct tty_struct *audit_get_tty(void); extern void audit_put_tty(struct tty_struct *tty); -/* audit watch functions */ +/* audit watch/mark/tree functions */ #ifdef CONFIG_AUDITSYSCALL +extern unsigned int audit_serial(void); +extern int auditsc_get_stamp(struct audit_context *ctx, + struct timespec64 *t, unsigned int *serial); + extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); -extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); +extern int audit_to_watch(struct audit_krule *krule, char *path, int len, + u32 op); extern int audit_add_watch(struct audit_krule 
*krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, + dev_t dev); -extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, + char *pathname, int len); extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); -extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, + unsigned long ino, dev_t dev); extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); -extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); +extern int audit_exe_compare(struct task_struct *tsk, + struct audit_fsnotify_mark *mark); + +extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); +extern void audit_put_chunk(struct audit_chunk *chunk); +extern bool audit_tree_match(struct audit_chunk *chunk, + struct audit_tree *tree); +extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); +extern int audit_add_tree_rule(struct audit_krule *rule); +extern int audit_remove_tree_rule(struct audit_krule *rule); +extern void audit_trim_trees(void); +extern int audit_tag_tree(char *old, char *new); +extern const char *audit_tree_path(struct audit_tree *tree); +extern void audit_put_tree(struct audit_tree *tree); +extern void audit_kill_trees(struct audit_context *context); -#else +extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_filter_inodes(struct task_struct *tsk, + struct audit_context *ctx); +extern struct list_head *audit_killed_trees(void); +#else /* CONFIG_AUDITSYSCALL */ +#define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} #define audit_get_watch(w) {} #define audit_to_watch(k, p, l, o) (-EINVAL) @@ -301,21 +320,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #define audit_mark_compare(m, i, d) 0 #define audit_exe_compare(t, m) (-EINVAL) #define audit_dupe_exe(n, o) (-EINVAL) -#endif /* CONFIG_AUDITSYSCALL */ -#ifdef CONFIG_AUDITSYSCALL -extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); -extern void audit_put_chunk(struct audit_chunk *chunk); -extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); -extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); -extern int audit_add_tree_rule(struct audit_krule *rule); -extern int audit_remove_tree_rule(struct audit_krule *rule); -extern void audit_trim_trees(void); -extern int audit_tag_tree(char *old, char *new); -extern const char *audit_tree_path(struct audit_tree *tree); -extern void audit_put_tree(struct audit_tree *tree); -extern void audit_kill_trees(struct list_head *list); -#else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL #define audit_make_tree(rule, str, op) -EINVAL @@ -323,8 +328,11 @@ extern void audit_kill_trees(struct list_head *list); #define audit_put_tree(tree) (void)0 #define audit_tag_tree(old, new) -EINVAL #define audit_tree_path(rule) "" /* 
never called */ -#define audit_kill_trees(list) BUG() -#endif +#define audit_kill_trees(context) BUG() + +#define audit_signal_info(s, t) AUDIT_DISABLED +#define audit_filter_inodes(t, c) AUDIT_DISABLED +#endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); @@ -334,14 +342,5 @@ extern u32 audit_sig_sid; extern int audit_filter(int msgtype, unsigned int listtype); -#ifdef CONFIG_AUDITSYSCALL -extern int audit_signal_info(int sig, struct task_struct *t); -extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); -extern struct list_head *audit_killed_trees(void); -#else -#define audit_signal_info(s,t) AUDIT_DISABLED -#define audit_filter_inodes(t,c) AUDIT_DISABLED -#endif - extern void audit_ctl_lock(void); extern void audit_ctl_unlock(void); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index cf4512a33675..37ae95cfb7f4 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -127,7 +127,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_session_info(ab); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index d4af4d97f847..abfb112f26aa 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -524,13 +524,14 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static void audit_tree_log_remove_rule(struct audit_krule *rule) +static void audit_tree_log_remove_rule(struct audit_context *context, + struct audit_krule *rule) { struct audit_buffer *ab; if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_format(ab, "op=remove_rule dir="); @@ -540,7 +541,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) audit_log_end(ab); } -static void kill_rules(struct audit_tree *tree) +static void kill_rules(struct audit_context *context, struct audit_tree *tree) { struct audit_krule *rule, *next; struct audit_entry *entry; @@ -551,7 +552,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - audit_tree_log_remove_rule(rule); + audit_tree_log_remove_rule(context, rule); if (entry->rule.exe) audit_remove_mark(entry->rule.exe); rule->tree = NULL; @@ -633,7 +634,7 @@ static void trim_marked(struct audit_tree *tree) tree->goner = 1; spin_unlock(&hash_lock); mutex_lock(&audit_filter_mutex); - kill_rules(tree); + kill_rules(audit_context(), tree); list_del_init(&tree->list); mutex_unlock(&audit_filter_mutex); prune_one(tree); @@ -973,8 +974,10 @@ static void audit_schedule_prune(void) * ... and that one is done if evict_chunk() decides to delay until the end * of syscall. Runs synchronously. 
*/ -void audit_kill_trees(struct list_head *list) +void audit_kill_trees(struct audit_context *context) { + struct list_head *list = &context->killed_trees; + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); @@ -982,7 +985,7 @@ void audit_kill_trees(struct list_head *list) struct audit_tree *victim; victim = list_entry(list->next, struct audit_tree, list); - kill_rules(victim); + kill_rules(context, victim); list_del_init(&victim->list); mutex_unlock(&audit_filter_mutex); @@ -1017,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk) list_del_init(&owner->same_root); spin_unlock(&hash_lock); if (!postponed) { - kill_rules(owner); + kill_rules(audit_context(), owner); list_move(&owner->list, &prune_list); need_prune = 1; } else { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 20ef9ba134b0..e8d1adeb2223 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -242,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index bf309f2592c4..63f8b3f26fab 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -670,7 +670,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->values[i] = AUDIT_UID_UNSET; break; } - /* fallthrough if set */ + /* fall through - if set */ default: data->values[i] = f->val; } @@ -1091,7 +1091,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); @@ -1355,7 +1355,7 @@ int audit_filter(int msgtype, unsigned int listtype) if (f->lsm_rule) { security_task_getsecid(current, &sid); result = security_audit_rule_match(sid, - f->type, f->op, f->lsm_rule, NULL); + f->type, f->op, f->lsm_rule); } break; case AUDIT_EXE: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6593a5207fb0..d1eab1d4a930 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -631,9 +631,8 @@ static int audit_filter_rules(struct task_struct *tsk, need_sid = 0; } result = security_audit_rule_match(sid, f->type, - f->op, - f->lsm_rule, - ctx); + f->op, + f->lsm_rule); } break; case AUDIT_OBJ_USER: @@ -647,13 +646,17 @@ static int audit_filter_rules(struct task_struct *tsk, /* Find files that match */ if (name) { result = security_audit_rule_match( - name->osid, f->type, f->op, - f->lsm_rule, ctx); + name->osid, + f->type, + f->op, + f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (security_audit_rule_match(n->osid, f->type, - f->op, f->lsm_rule, - ctx)) { + if (security_audit_rule_match( + n->osid, + f->type, + f->op, + f->lsm_rule)) { ++result; break; } @@ -664,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; if (security_audit_rule_match(ctx->ipc.osid, f->type, f->op, - f->lsm_rule, ctx)) + f->lsm_rule)) ++result; } break; @@ -1136,6 +1139,32 @@ out: kfree(buf_head); } +void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + if (cap_isclear(*cap)) { + audit_log_format(ab, " %s=0", prefix); + return; + } + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) + audit_log_format(ab, 
"%08x", cap->cap[CAP_LAST_U32 - i]); +} + +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + if (name->fcap_ver == -1) { + audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?"); + return; + } + audit_log_cap(ab, "cap_fp", &name->fcap.permitted); + audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); + audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d", + name->fcap.fE, name->fcap_ver, + from_kuid(&init_user_ns, name->fcap.rootid)); +} + static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1258,6 +1287,97 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len) return len; } +/* + * audit_log_name - produce AUDIT_PATH record from struct audit_names + * @context: audit_context for the task + * @n: audit_names structure with reportable details + * @path: optional path to report instead of audit_names->name + * @record_num: record number to report when handling a list of names + * @call_panic: optional pointer to int that will be updated if secid fails + */ +static void audit_log_name(struct audit_context *context, struct audit_names *n, + const struct path *path, int record_num, int *call_panic) +{ + struct audit_buffer *ab; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; + + audit_log_format(ab, "item=%d", record_num); + + if (path) + audit_log_d_path(ab, " name=", path); + else if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd + */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != AUDIT_INO_UNSET) + audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), + MAJOR(n->rdev), + MINOR(n->rdev)); + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + if (call_panic) + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + /* log the audit_names record type */ + switch (n->type) { + case AUDIT_TYPE_NORMAL: + audit_log_format(ab, " nametype=NORMAL"); + break; + case AUDIT_TYPE_PARENT: + audit_log_format(ab, " nametype=PARENT"); + break; + case AUDIT_TYPE_CHILD_DELETE: + audit_log_format(ab, " nametype=DELETE"); + break; + case AUDIT_TYPE_CHILD_CREATE: + audit_log_format(ab, " nametype=CREATE"); + break; + default: + audit_log_format(ab, " nametype=UNKNOWN"); + break; + } + + audit_log_fcaps(ab, n); + audit_log_end(ab); +} + static void audit_log_proctitle(void) { int res; @@ -1358,6 +1478,9 @@ static void audit_log_exit(void) audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); audit_log_cap(ab, "pe", &axs->new_pcap.effective); audit_log_cap(ab, "pa", &axs->new_pcap.ambient); + audit_log_format(ab, " frootid=%d", + from_kuid(&init_user_ns, + axs->fcap.rootid)); break; } } @@ -1444,6 +1567,9 @@ void __audit_free(struct task_struct *tsk) if (!context) return; + if 
(!list_empty(&context->killed_trees)) + audit_kill_trees(context); + /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a * random task_struct that doesn't doesn't have any meaningful data we @@ -1460,9 +1586,6 @@ void __audit_free(struct task_struct *tsk) audit_log_exit(); } - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - audit_set_context(tsk, NULL); audit_free_context(context); } @@ -1537,6 +1660,9 @@ void __audit_syscall_exit(int success, long return_code) if (!context) return; + if (!list_empty(&context->killed_trees)) + audit_kill_trees(context); + if (!context->dummy && context->in_syscall) { if (success) context->return_valid = AUDITSC_SUCCESS; @@ -1571,9 +1697,6 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); @@ -1750,6 +1873,47 @@ void __audit_getname(struct filename *name) get_fs_pwd(current->fs, &context->pwd); } +static inline int audit_copy_fcaps(struct audit_names *name, + const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap.rootid = caps.rootid; + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> + VFS_CAP_REVISION_SHIFT; + + return 0; +} + +/* Copy inode data into an audit_names. 
*/ +void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + struct inode *inode, unsigned int flags) +{ + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + security_inode_getsecid(inode, &name->osid); + if (flags & AUDIT_INODE_NOEVAL) { + name->fcap_ver = -1; + return; + } + audit_copy_fcaps(name, dentry); +} + /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1763,10 +1927,31 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; + struct audit_entry *e; + struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; + int i; if (!context->in_syscall) return; + rcu_read_lock(); + if (!list_empty(list)) { + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(inode->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; + } + } + } + } + rcu_read_unlock(); + if (!name) goto out_alloc; @@ -1832,7 +2017,7 @@ out: n->type = AUDIT_TYPE_NORMAL; } handle_path(dentry); - audit_copy_inode(n, dentry, inode); + audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL); } void __audit_file(const struct file *file) @@ -1875,14 +2060,12 @@ void __audit_inode_child(struct inode *parent, for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - if (f->type == AUDIT_FSTYPE) { - if (audit_comparator(parent->i_sb->s_magic, - f->op, f->val)) { - if (e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } - } + if (f->type == AUDIT_FSTYPE + && audit_comparator(parent->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } @@ -1933,7 +2116,7 @@ void __audit_inode_child(struct inode *parent, n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; - audit_copy_inode(n, NULL, parent); + audit_copy_inode(n, NULL, parent, 0); } if (!found_child) { @@ -1952,7 +2135,7 @@ void __audit_inode_child(struct inode *parent, } if (inode) - audit_copy_inode(found_child, dentry, inode); + audit_copy_inode(found_child, dentry, inode, 0); else found_child->ino = AUDIT_INO_UNSET; } @@ -1983,90 +2166,6 @@ int auditsc_get_stamp(struct audit_context *ctx, return 1; } -/* global counter which is incremented every time something logs in */ -static atomic_t session_id = ATOMIC_INIT(0); - -static int audit_set_loginuid_perm(kuid_t loginuid) -{ - /* if we are unset, we don't need privs */ - if (!audit_loginuid_set(current)) - return 0; - /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ - if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) - return -EPERM; - /* it is set, you need permission */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - /* reject if this is not an unset and we don't allow that */ - if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) - return -EPERM; - return 0; -} - -static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, - unsigned int oldsessionid, unsigned int sessionid, - int rc) -{ - struct audit_buffer *ab; - uid_t uid, oldloginuid, loginuid; - struct tty_struct *tty; - - if (!audit_enabled) - return; - - ab = 
audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (!ab) - return; - - uid = from_kuid(&init_user_ns, task_uid(current)); - oldloginuid = from_kuid(&init_user_ns, koldloginuid); - loginuid = from_kuid(&init_user_ns, kloginuid), - tty = audit_get_tty(); - - audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); - audit_log_task_context(ab); - audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", - oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", - oldsessionid, sessionid, !rc); - audit_put_tty(tty); - audit_log_end(ab); -} - -/** - * audit_set_loginuid - set current task's audit_context loginuid - * @loginuid: loginuid value - * - * Returns 0. - * - * Called (set) from fs/proc/base.c::proc_loginuid_write(). - */ -int audit_set_loginuid(kuid_t loginuid) -{ - unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; - kuid_t oldloginuid; - int rc; - - oldloginuid = audit_get_loginuid(current); - oldsessionid = audit_get_sessionid(current); - - rc = audit_set_loginuid_perm(loginuid); - if (rc) - goto out; - - /* are we setting or clearing? */ - if (uid_valid(loginuid)) { - sessionid = (unsigned int)atomic_inc_return(&session_id); - if (unlikely(sessionid == AUDIT_SID_UNSET)) - sessionid = (unsigned int)atomic_inc_return(&session_id); - } - - current->sessionid = sessionid; - current->loginuid = loginuid; -out: - audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); - return rc; -} - /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag @@ -2355,6 +2454,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + ax->fcap.rootid = vcaps.rootid; ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = old->cap_permitted; diff --git a/kernel/capability.c b/kernel/capability.c index 1e1c0236f55b..1444f3954d75 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); - /* - * fall through - v3 is otherwise equivalent to v2. - */ + /* fall through - v3 is otherwise equivalent to v2. */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; @@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); @@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); @@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } -static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +static bool ns_capable_common(struct user_namespace *ns, + int cap, + unsigned int opts) { int capable; @@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) BUG(); } - capable = audit ? 
security_capable(current_cred(), ns, cap) : - security_capable_noaudit(current_cred(), ns, cap); + capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; @@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) */ bool ns_capable(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, true); + return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); @@ -412,11 +411,30 @@ EXPORT_SYMBOL(ns_capable); */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, false); + return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** + * ns_capable_setid - Determine if the current task has a superior capability + * in effect, while signalling that this check is being done from within a + * setid syscall. + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable_setid(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, CAP_OPT_INSETID); +} +EXPORT_SYMBOL(ns_capable_setid); + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * @@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable); bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { + if (WARN_ON_ONCE(!cap_valid(cap))) return false; - if (security_capable(file->f_cred, ns, cap) == 0) + if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; @@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; + rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) - ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, + CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cef98502b124..eef24a25bda7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -197,7 +197,7 @@ static u64 css_serial_nr_next = 1; */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_free_callback __read_mostly; +static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ @@ -3534,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3572,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; @@ -3580,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = { .open = cgroup_file_open, 
.release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, @@ -5314,7 +5326,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_free_callback |= (bool)ss->free << ss->id; + have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5750,16 +5762,19 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_free(struct task_struct *task) +void cgroup_release(struct task_struct *task) { - struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_free_callback) { - ss->free(task); + do_each_subsys_mask(ss, ssid, have_release_callback) { + ss->release(task); } while_each_subsys_mask(); +} +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); put_css_set(cset); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 479743db6c37..72afd55f70c6 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -203,19 +203,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) return css_cs(cs->css.parent); } -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return false; -} -#endif - - /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 9829c67ebc0a..c9960baaa14f 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } -static void pids_free(struct task_struct *task) +static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .free = pids_free, + .release = pids_release, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, .threaded = true, diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index d3bbb757ee49..1d75ae7f1cb7 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -313,10 +313,8 @@ EXPORT_SYMBOL(rdmacg_try_charge); * If IB stack wish a device to participate in rdma cgroup resource * tracking, it must invoke this API to register with rdma cgroup before * any user space application can start using the RDMA resources. - * Returns 0 on success or EINVAL when table length given is beyond - * supported size. 
*/ -int rdmacg_register_device(struct rdmacg_device *device) +void rdmacg_register_device(struct rdmacg_device *device) { INIT_LIST_HEAD(&device->dev_node); INIT_LIST_HEAD(&device->rpools); @@ -324,7 +322,6 @@ int rdmacg_register_device(struct rdmacg_device *device) mutex_lock(&rdmacg_mutex); list_add_tail(&device->dev_node, &rdmacg_devices); mutex_unlock(&rdmacg_mutex); - return 0; } EXPORT_SYMBOL(rdmacg_register_device); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d503d1a9007c..bb95a35e8c2d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; - struct cgroup *parent; if (pos == root) return NULL; @@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - parent = cgroup_parent(pos); - if (parent && rstatc->updated_next) { + if (rstatc->updated_next) { + struct cgroup *parent = cgroup_parent(pos); struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); struct cgroup_rstat_cpu *nrstatc; struct cgroup **nextp; @@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * updated stat. */ smp_mb(); + + return pos; } - return pos; + /* only happens for @root */ + return NULL; } /* see cgroup_rstat_flush() */ diff --git a/kernel/configs.c b/kernel/configs.c index 2df132b20217..b062425ccf8d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -30,37 +30,35 @@ #include <linux/init.h> #include <linux/uaccess.h> -/**************************************************/ -/* the actual current config file */ - /* - * Define kernel_config_data and kernel_config_data_size, which contains the - * wrapped and compressed configuration file. The file is first compressed - * with gzip and then bounded by two eight byte magic numbers to allow - * extraction from a binary kernel image: - * - * IKCFG_ST - * <image> - * IKCFG_ED + * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from + * a binary kernel image or a module. See scripts/extract-ikconfig. 
*/ -#define MAGIC_START "IKCFG_ST" -#define MAGIC_END "IKCFG_ED" -#include "config_data.h" - - -#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) -#define kernel_config_data_size \ - (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) +asm ( +" .pushsection .rodata, \"a\" \n" +" .ascii \"IKCFG_ST\" \n" +" .global kernel_config_data \n" +"kernel_config_data: \n" +" .incbin \"kernel/config_data.gz\" \n" +" .global kernel_config_data_end \n" +"kernel_config_data_end: \n" +" .ascii \"IKCFG_ED\" \n" +" .popsection \n" +); #ifdef CONFIG_IKCONFIG_PROC +extern char kernel_config_data; +extern char kernel_config_data_end; + static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { return simple_read_from_buffer(buf, len, offset, - kernel_config_data + MAGIC_SIZE, - kernel_config_data_size); + &kernel_config_data, + &kernel_config_data_end - + &kernel_config_data); } static const struct file_operations ikconfig_file_ops = { @@ -79,7 +77,7 @@ static int __init ikconfig_init(void) if (!entry) return -ENOMEM; - proc_set_size(entry, kernel_config_data_size); + proc_set_size(entry, &kernel_config_data_end - &kernel_config_data); return 0; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..093c9f917ed0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif arch_crash_save_vmcoreinfo(); diff --git a/kernel/cred.c b/kernel/cred.c index 21f4a97085b4..45d77284aed0 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -760,19 +760,6 @@ bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; -#ifdef CONFIG_SECURITY_SELINUX - /* - * cred->security == NULL if security_cred_alloc_blank() or - * security_prepare_creds() returned an error. - */ - if (selinux_is_enabled() && cred->security) { - if ((unsigned long) cred->security < PAGE_SIZE) - return true; - if ((*(u32 *)cred->security & 0xffffff00) == - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) - return true; - } -#endif return false; } EXPORT_SYMBOL(creds_are_invalid); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index ca88b867e7fe..a06ba3013b3b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -16,7 +16,16 @@ config ARCH_DMA_ADDR_T_64BIT config ARCH_HAS_DMA_COHERENCE_H bool -config HAVE_GENERIC_DMA_COHERENT +config ARCH_HAS_DMA_SET_MASK + bool + +config DMA_DECLARE_COHERENT + bool + +config ARCH_HAS_SETUP_DMA_OPS + bool + +config ARCH_HAS_TEARDOWN_DMA_OPS bool config ARCH_HAS_SYNC_DMA_FOR_DEVICE @@ -53,3 +62,116 @@ config DMA_REMAP config DMA_DIRECT_REMAP bool select DMA_REMAP + +config DMA_CMA + bool "DMA Contiguous Memory Allocator" + depends on HAVE_DMA_CONTIGUOUS && CMA + help + This enables the Contiguous Memory Allocator which allows drivers + to allocate big physically-contiguous blocks of memory for use with + hardware components that do not support I/O map nor scatter-gather. + + You can disable CMA by specifying "cma=0" on the kernel's command + line. + + For more information see <include/linux/dma-contiguous.h>. + If unsure, say "n". 
+ +if DMA_CMA +comment "Default contiguous memory area size:" + +config CMA_SIZE_MBYTES + int "Size in Mega Bytes" + depends on !CMA_SIZE_SEL_PERCENTAGE + default 0 if X86 + default 16 + help + Defines the size (in MiB) of the default memory area for Contiguous + Memory Allocator. If the size of 0 is selected, CMA is disabled by + default, but it can be enabled by passing cma=size[MG] to the kernel. + + +config CMA_SIZE_PERCENTAGE + int "Percentage of total memory" + depends on !CMA_SIZE_SEL_MBYTES + default 0 if X86 + default 10 + help + Defines the size of the default memory area for Contiguous Memory + Allocator as a percentage of the total memory in the system. + If 0 percent is selected, CMA is disabled by default, but it can be + enabled by passing cma=size[MG] to the kernel. + +choice + prompt "Selected region size" + default CMA_SIZE_SEL_MBYTES + +config CMA_SIZE_SEL_MBYTES + bool "Use mega bytes value only" + +config CMA_SIZE_SEL_PERCENTAGE + bool "Use percentage value only" + +config CMA_SIZE_SEL_MIN + bool "Use lower value (minimum)" + +config CMA_SIZE_SEL_MAX + bool "Use higher value (maximum)" + +endchoice + +config CMA_ALIGNMENT + int "Maximum PAGE_SIZE order of alignment for contiguous buffers" + range 4 12 + default 8 + help + DMA mapping framework by default aligns all buffers to the smallest + PAGE_SIZE order which is greater than or equal to the requested buffer + size. This works well for buffers up to a few hundreds kilobytes, but + for larger buffers it just a memory waste. With this parameter you can + specify the maximum PAGE_SIZE order for contiguous buffers. Larger + buffers will be aligned only to this specified order. The order is + expressed as a power of two multiplied by the PAGE_SIZE. + + For example, if your system defaults to 4KiB pages, the order value + of 8 means that the buffers will be aligned up to 1MiB only. + + If unsure, leave the default value "8". + +endif + +config DMA_API_DEBUG + bool "Enable debugging of DMA-API usage" + select NEED_DMA_MAP_STATE + help + Enable this option to debug the use of the DMA API by device drivers. + With this option you will be able to detect common bugs in device + drivers like double-freeing of DMA mappings or freeing mappings that + were never allocated. + + This also attempts to catch cases where a page owned by DMA is + accessed by the cpu in a way that could cause data corruption. For + example, this enables cow_user_page() to check that the source page is + not undergoing DMA. + + This option causes a performance degradation. Use only if you want to + debug device drivers and dma interactions. + + If unsure, say N. + +config DMA_API_DEBUG_SG + bool "Debug DMA scatter-gather usage" + default y + depends on DMA_API_DEBUG + help + Perform extra checking that callers of dma_map_sg() have respected the + appropriate segment length/boundary limits for the given device when + preparing DMA scatterlists. + + This is particularly likely to have been overlooked in cases where the + dma_map_sg() API is used for general bulk mapping of pages rather than + preparing literal scatter-gather descriptors, where there is a risk of + unexpected behaviour from DMA API implementations if the scatterlist + is technically out-of-spec. + + If unsure, say N. 
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 72ff6e46aa86..d237cf3dc181 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o +obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 66f0fb7e9a3a..29fd6590dc1e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -14,7 +14,6 @@ struct dma_coherent_mem { dma_addr_t device_base; unsigned long pfn_base; int size; - int flags; unsigned long *bitmap; spinlock_t spinlock; bool use_dev_dma_pfn_offset; @@ -38,12 +37,12 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, return mem->device_base; } -static int dma_init_coherent_memory( - phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, - struct dma_coherent_mem **mem) +static int dma_init_coherent_memory(phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, + struct dma_coherent_mem **mem) { struct dma_coherent_mem *dma_mem = NULL; - void __iomem *mem_base = NULL; + void *mem_base = NULL; int pages = size >> PAGE_SHIFT; int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); int ret; @@ -73,7 +72,6 @@ static int dma_init_coherent_memory( dma_mem->device_base = device_addr; dma_mem->pfn_base = PFN_DOWN(phys_addr); dma_mem->size = pages; - dma_mem->flags = flags; spin_lock_init(&dma_mem->spinlock); *mem = dma_mem; @@ -110,12 +108,12 @@ static int dma_assign_coherent_memory(struct device *dev, } int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) + dma_addr_t device_addr, size_t size) { struct dma_coherent_mem *mem; int ret; - ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem); if (ret) return ret; @@ -137,29 +135,6 @@ void dma_release_declared_memory(struct device *dev) } EXPORT_SYMBOL(dma_release_declared_memory); -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - unsigned long flags; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&mem->spinlock, flags); - pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - spin_unlock_irqrestore(&mem->spinlock, flags); - - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) { @@ -213,15 +188,7 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, return 0; *ret = __dma_alloc_from_coherent(mem, size, dma_handle); - if (*ret) - return 1; - - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. 
- */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; + return 1; } void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) @@ -350,8 +317,7 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) if (!mem) { ret = dma_init_coherent_memory(rmem->base, rmem->base, - rmem->size, - DMA_MEMORY_EXCLUSIVE, &mem); + rmem->size, &mem); if (ret) { pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 23cf5361bcf1..45d51e8e26f6 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -134,17 +134,6 @@ static u32 nr_total_entries; /* number of preallocated entries requested by kernel cmdline */ static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; -/* debugfs dentry's for the stuff above */ -static struct dentry *dma_debug_dent __read_mostly; -static struct dentry *global_disable_dent __read_mostly; -static struct dentry *error_count_dent __read_mostly; -static struct dentry *show_all_errors_dent __read_mostly; -static struct dentry *show_num_errors_dent __read_mostly; -static struct dentry *num_free_entries_dent __read_mostly; -static struct dentry *min_free_entries_dent __read_mostly; -static struct dentry *nr_total_entries_dent __read_mostly; -static struct dentry *filter_dent __read_mostly; - /* per-driver filter related state */ #define NAME_MAX_LEN 64 @@ -840,66 +829,46 @@ static const struct file_operations filter_fops = { .llseek = default_llseek, }; -static int dma_debug_fs_init(void) +static int dump_show(struct seq_file *seq, void *v) { - dma_debug_dent = debugfs_create_dir("dma-api", NULL); - if (!dma_debug_dent) { - pr_err("can not create debugfs directory\n"); - return -ENOMEM; - } + int idx; - global_disable_dent = debugfs_create_bool("disabled", 0444, - dma_debug_dent, - &global_disable); - if (!global_disable_dent) - goto out_err; - - error_count_dent = debugfs_create_u32("error_count", 0444, - dma_debug_dent, &error_count); - if (!error_count_dent) - goto out_err; - - show_all_errors_dent = debugfs_create_u32("all_errors", 0644, - dma_debug_dent, - &show_all_errors); - if (!show_all_errors_dent) - goto out_err; - - show_num_errors_dent = debugfs_create_u32("num_errors", 0644, - dma_debug_dent, - &show_num_errors); - if (!show_num_errors_dent) - goto out_err; - - num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, - dma_debug_dent, - &num_free_entries); - if (!num_free_entries_dent) - goto out_err; - - min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, - dma_debug_dent, - &min_free_entries); - if (!min_free_entries_dent) - goto out_err; - - nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444, - dma_debug_dent, - &nr_total_entries); - if (!nr_total_entries_dent) - goto out_err; - - filter_dent = debugfs_create_file("driver_filter", 0644, - dma_debug_dent, NULL, &filter_fops); - if (!filter_dent) - goto out_err; + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + spin_lock_irqsave(&bucket->lock, flags); + list_for_each_entry(entry, &bucket->list, list) { + seq_printf(seq, + "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n", + dev_name(entry->dev), + dev_driver_string(entry->dev), + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + 
spin_unlock_irqrestore(&bucket->lock, flags); + } return 0; +} +DEFINE_SHOW_ATTRIBUTE(dump); -out_err: - debugfs_remove_recursive(dma_debug_dent); - - return -ENOMEM; +static void dma_debug_fs_init(void) +{ + struct dentry *dentry = debugfs_create_dir("dma-api", NULL); + + debugfs_create_bool("disabled", 0444, dentry, &global_disable); + debugfs_create_u32("error_count", 0444, dentry, &error_count); + debugfs_create_u32("all_errors", 0644, dentry, &show_all_errors); + debugfs_create_u32("num_errors", 0644, dentry, &show_num_errors); + debugfs_create_u32("num_free_entries", 0444, dentry, &num_free_entries); + debugfs_create_u32("min_free_entries", 0444, dentry, &min_free_entries); + debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries); + debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops); + debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops); } static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) @@ -985,12 +954,7 @@ static int dma_debug_init(void) spin_lock_init(&dma_entry_hash[i].lock); } - if (dma_debug_fs_init() != 0) { - pr_err("error creating debugfs entries - disabling\n"); - global_disable = true; - - return 0; - } + dma_debug_fs_init(); nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES); for (i = 0; i < nr_pages; ++i) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 355d16acee6d..fcdb23e8d2fc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -132,8 +132,7 @@ again: goto again; } - if (IS_ENABLED(CONFIG_ZONE_DMA) && - phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { + if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -356,6 +355,20 @@ out_unmap: } EXPORT_SYMBOL(dma_direct_map_sg); +dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + dma_addr_t dma_addr = paddr; + + if (unlikely(!dma_direct_possible(dev, dma_addr, size))) { + report_addr(dev, dma_addr, size); + return DMA_MAPPING_ERROR; + } + + return dma_addr; +} +EXPORT_SYMBOL(dma_direct_map_resource); + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by @@ -380,3 +393,14 @@ int dma_direct_supported(struct device *dev, u64 mask) */ return mask >= __phys_to_dma(dev, min_mask); } + +size_t dma_direct_max_mapping_size(struct device *dev) +{ + size_t size = SIZE_MAX; + + /* If SWIOTLB is active, use its maximum mapping size */ + if (is_swiotlb_active()) + size = swiotlb_max_mapping_size(dev); + + return size; +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a11006b6d8e8..c000906348c9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL(dma_mmap_attrs); -#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK static u64 dma_default_get_required_mask(struct device *dev) { u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); @@ -238,7 +237,6 @@ u64 dma_get_required_mask(struct device *dev) return dma_default_get_required_mask(dev); } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#endif #ifndef arch_dma_alloc_attrs #define arch_dma_alloc_attrs(dev) (true) @@ -318,18 +316,23 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -#ifndef HAVE_ARCH_DMA_SET_MASK +#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK +void arch_dma_set_mask(struct 
device *dev, u64 mask); +#else +#define arch_dma_set_mask(dev, mask) do { } while (0) +#endif + int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; + arch_dma_set_mask(dev, mask); dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } EXPORT_SYMBOL(dma_set_mask); -#endif #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) @@ -357,3 +360,17 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, ops->cache_sync(dev, vaddr, size, dir); } EXPORT_SYMBOL(dma_cache_sync); + +size_t dma_max_mapping_size(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + size_t size = SIZE_MAX; + + if (dma_is_direct(ops)) + size = dma_direct_max_mapping_size(dev); + else if (ops && ops->max_mapping_size) + size = ops->max_mapping_size(dev); + + return size; +} +EXPORT_SYMBOL_GPL(dma_max_mapping_size); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 1fb6fd68b9c7..9f5851064aea 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -34,6 +34,9 @@ #include <linux/scatterlist.h> #include <linux/mem_encrypt.h> #include <linux/set_memory.h> +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> +#endif #include <asm/io.h> #include <asm/dma.h> @@ -73,6 +76,11 @@ phys_addr_t io_tlb_start, io_tlb_end; static unsigned long io_tlb_nslabs; /* + * The number of used IO TLB block + */ +static unsigned long io_tlb_used; + +/* * This is a free list describing the number of free entries available from * each index */ @@ -385,7 +393,7 @@ void __init swiotlb_exit(void) } /* - * Bounce: copy the swiotlb buffer back to the original dma location + * Bounce: copy the swiotlb buffer from or back to the original dma location */ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) @@ -475,6 +483,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, * request and allocate a buffer from that IO TLB pool. */ spin_lock_irqsave(&io_tlb_lock, flags); + + if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) + goto not_found; + index = ALIGN(io_tlb_index, stride); if (index >= io_tlb_nslabs) index = 0; @@ -524,6 +536,7 @@ not_found: dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); return DMA_MAPPING_ERROR; found: + io_tlb_used += nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); /* @@ -584,6 +597,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, */ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) io_tlb_list[i] = ++count; + + io_tlb_used -= nslots; } spin_unlock_irqrestore(&io_tlb_lock, flags); } @@ -651,14 +666,49 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, return true; } -/* - * Return whether the given device DMA address mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask to - * this function. - */ -int -swiotlb_dma_supported(struct device *hwdev, u64 mask) +size_t swiotlb_max_mapping_size(struct device *dev) { - return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; } + +bool is_swiotlb_active(void) +{ + /* + * When SWIOTLB is initialized, even if io_tlb_start points to physical + * address zero, io_tlb_end surely doesn't. 
+ */ + return io_tlb_end != 0; +} + +#ifdef CONFIG_DEBUG_FS + +static int __init swiotlb_create_debugfs(void) +{ + struct dentry *d_swiotlb_usage; + struct dentry *ent; + + d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL); + + if (!d_swiotlb_usage) + return -ENOMEM; + + ent = debugfs_create_ulong("io_tlb_nslabs", 0400, + d_swiotlb_usage, &io_tlb_nslabs); + if (!ent) + goto fail; + + ent = debugfs_create_ulong("io_tlb_used", 0400, + d_swiotlb_usage, &io_tlb_used); + if (!ent) + goto fail; + + return 0; + +fail: + debugfs_remove_recursive(d_swiotlb_usage); + return -ENOMEM; +} + +late_initcall(swiotlb_create_debugfs); + +#endif diff --git a/kernel/events/core.c b/kernel/events/core.c index 5f59d848171e..f7086c388781 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5473,7 +5473,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); @@ -5546,7 +5546,7 @@ again: */ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: @@ -5694,7 +5694,7 @@ accounting: lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -5735,7 +5735,7 @@ accounting: unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { diff --git a/kernel/exit.c b/kernel/exit.c index 2639a30a8aa5..2166c2d92ddc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -219,6 +219,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + cgroup_release(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); diff --git a/kernel/fork.c b/kernel/fork.c index 77059b211608..9dcd18aa210b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,7 +77,6 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/sched/mm.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> @@ -981,7 +980,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; - mm->pinned_vm = 0; + atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 1e32e66c9563..2dddecbdbe6e 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -245,8 +245,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info) /* Duplicate gcov_info. 
*/ active = num_counter_active(info); - dup = kzalloc(sizeof(struct gcov_info) + - sizeof(struct gcov_ctr_info) * active, GFP_KERNEL); + dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL); if (!dup) return NULL; dup->version = info->version; @@ -364,8 +363,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info) { struct gcov_iterator *iter; - iter = kzalloc(sizeof(struct gcov_iterator) + - num_counter_active(info) * sizeof(struct type_info), + iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)), GFP_KERNEL); if (iter) iter->info = info; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 4a9191617076..f108a95882c6 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -19,6 +19,7 @@ #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/sched/sysctl.h> #include <trace/events/sched.h> @@ -126,7 +127,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", - t->comm, t->pid, timeout); + t->comm, t->pid, (jiffies - t->last_switch_time) / HZ); pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 99b7dd6982a4..3faef4a77f71 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1340,6 +1340,17 @@ void irq_chip_mask_parent(struct irq_data *data) EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** + * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_mask_ack_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_mask_ack(data); +} +EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent); + +/** * irq_chip_unmask_parent - Unmask the parent interrupt * @data: Pointer to interrupt specific data */ @@ -1443,6 +1454,7 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); #endif /** diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 98a20e1594ce..b992f88c5613 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -25,10 +25,22 @@ static void irq_sim_irqunmask(struct irq_data *data) irq_ctx->enabled = true; } +static int irq_sim_set_type(struct irq_data *data, unsigned int type) +{ + /* We only support rising and falling edge trigger types. */ + if (type & ~IRQ_TYPE_EDGE_BOTH) + return -EINVAL; + + irqd_set_trigger_type(data, type); + + return 0; +} + static struct irq_chip irq_sim_irqchip = { .name = "irq_sim", .irq_mask = irq_sim_irqmask, .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, }; static void irq_sim_handle_irq(struct irq_work *work) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3bf9793d8825..9ed29e4a7dbf 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -743,16 +743,17 @@ static int irq_domain_translate(struct irq_domain *d, return 0; } -static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, +static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec) { int i; - fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL; - fwspec->param_count = irq_data->args_count; + fwspec->fwnode = np ? 
&np->fwnode : NULL; + fwspec->param_count = count; - for (i = 0; i < irq_data->args_count; i++) - fwspec->param[i] = irq_data->args[i]; + for (i = 0; i < count; i++) + fwspec->param[i] = args[i]; } unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) @@ -850,7 +851,9 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_fwspec fwspec; - of_phandle_args_to_fwspec(irq_data, &fwspec); + of_phandle_args_to_fwspec(irq_data->np, irq_data->args, + irq_data->args_count, &fwspec); + return irq_create_fwspec_mapping(&fwspec); } EXPORT_SYMBOL_GPL(irq_create_of_mapping); @@ -942,11 +945,10 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { - if (WARN_ON(intsize < 2)) - return -EINVAL; - *out_hwirq = intspec[0]; - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; - return 0; + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); @@ -982,6 +984,27 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); +/** + * irq_domain_translate_twocell() - Generic translate for direct two cell + * bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. + */ +int irq_domain_translate_twocell(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *out_hwirq, + unsigned int *out_type) +{ + if (WARN_ON(fwspec->param_count < 2)) + return -EINVAL; + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { diff --git a/kernel/kcov.c b/kernel/kcov.c index c2277dbdbfb1..2ee38727844a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -20,6 +20,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/kcov.h> +#include <linux/refcount.h> #include <asm/setup.h> /* Number of 64-bit words written per one comparison: */ @@ -44,7 +45,7 @@ struct kcov { * - opened file descriptor * - task with enabled coverage (we can't unwire it from another task) */ - atomic_t refcount; + refcount_t refcount; /* The lock protects mode, size, area and t. */ spinlock_t lock; enum kcov_mode mode; @@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch); static void kcov_get(struct kcov *kcov) { - atomic_inc(&kcov->refcount); + refcount_inc(&kcov->refcount); } static void kcov_put(struct kcov *kcov) { - if (atomic_dec_and_test(&kcov->refcount)) { + if (refcount_dec_and_test(&kcov->refcount)) { vfree(kcov->area); kfree(kcov); } @@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep) if (!kcov) return -ENOMEM; kcov->mode = KCOV_MODE_DISABLED; - atomic_set(&kcov->refcount, 1); + refcount_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; return nonseekable_open(inode, filep); @@ -444,10 +445,8 @@ static int __init kcov_init(void) * there is no need to protect it against removal races. The * use of debugfs_create_file_unsafe() is actually safe here. 
*/ - if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { - pr_err("failed to create kcov in debugfs\n"); - return -ENOMEM; - } + debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); + return 0; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 9cf20cc5ebe3..5942eeafb9ac 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/numa.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -681,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; - int node = -1; + int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5b77a7314e01..eb0ee10a1981 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -45,7 +45,12 @@ */ DEFINE_MUTEX(klp_mutex); -static LIST_HEAD(klp_patches); +/* + * Actively used patches: enabled or in transition. Note that replaced + * or disabled patches are not listed even though the related kernel + * module still can be loaded. + */ +LIST_HEAD(klp_patches); static struct kobject *klp_root_kobj; @@ -82,20 +87,43 @@ static void klp_find_object_module(struct klp_object *obj) mutex_unlock(&module_mutex); } -static bool klp_is_patch_registered(struct klp_patch *patch) +static bool klp_initialized(void) +{ + return !!klp_root_kobj; +} + +static struct klp_func *klp_find_func(struct klp_object *obj, + struct klp_func *old_func) { - struct klp_patch *mypatch; + struct klp_func *func; - list_for_each_entry(mypatch, &klp_patches, list) - if (mypatch == patch) - return true; + klp_for_each_func(obj, func) { + if ((strcmp(old_func->old_name, func->old_name) == 0) && + (old_func->old_sympos == func->old_sympos)) { + return func; + } + } - return false; + return NULL; } -static bool klp_initialized(void) +static struct klp_object *klp_find_object(struct klp_patch *patch, + struct klp_object *old_obj) { - return !!klp_root_kobj; + struct klp_object *obj; + + klp_for_each_object(patch, obj) { + if (klp_is_module(old_obj)) { + if (klp_is_module(obj) && + strcmp(old_obj->name, obj->name) == 0) { + return obj; + } + } else if (!klp_is_module(obj)) { + return obj; + } + } + + return NULL; } struct klp_find_arg { @@ -278,170 +306,6 @@ static int klp_write_object_relocations(struct module *pmod, return ret; } -static int __klp_disable_patch(struct klp_patch *patch) -{ - struct klp_object *obj; - - if (WARN_ON(!patch->enabled)) - return -EINVAL; - - if (klp_transition_patch) - return -EBUSY; - - /* enforce stacking: only the last enabled patch can be disabled */ - if (!list_is_last(&patch->list, &klp_patches) && - list_next_entry(patch, list)->enabled) - return -EBUSY; - - klp_init_transition(patch, KLP_UNPATCHED); - - klp_for_each_object(patch, obj) - if (obj->patched) - klp_pre_unpatch_callback(obj); - - /* - * Enforce the order of the func->transition writes in - * klp_init_transition() and the TIF_PATCH_PENDING writes in - * klp_start_transition(). In the rare case where klp_ftrace_handler() - * is called shortly after klp_update_patch_state() switches the task, - * this ensures the handler sees that func->transition is set. 
- */ - smp_wmb(); - - klp_start_transition(); - klp_try_complete_transition(); - patch->enabled = false; - - return 0; -} - -/** - * klp_disable_patch() - disables a registered patch - * @patch: The registered, enabled patch to be disabled - * - * Unregisters the patched functions from ftrace. - * - * Return: 0 on success, otherwise error - */ -int klp_disable_patch(struct klp_patch *patch) -{ - int ret; - - mutex_lock(&klp_mutex); - - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } - - if (!patch->enabled) { - ret = -EINVAL; - goto err; - } - - ret = __klp_disable_patch(patch); - -err: - mutex_unlock(&klp_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(klp_disable_patch); - -static int __klp_enable_patch(struct klp_patch *patch) -{ - struct klp_object *obj; - int ret; - - if (klp_transition_patch) - return -EBUSY; - - if (WARN_ON(patch->enabled)) - return -EINVAL; - - /* enforce stacking: only the first disabled patch can be enabled */ - if (patch->list.prev != &klp_patches && - !list_prev_entry(patch, list)->enabled) - return -EBUSY; - - /* - * A reference is taken on the patch module to prevent it from being - * unloaded. - */ - if (!try_module_get(patch->mod)) - return -ENODEV; - - pr_notice("enabling patch '%s'\n", patch->mod->name); - - klp_init_transition(patch, KLP_PATCHED); - - /* - * Enforce the order of the func->transition writes in - * klp_init_transition() and the ops->func_stack writes in - * klp_patch_object(), so that klp_ftrace_handler() will see the - * func->transition updates before the handler is registered and the - * new funcs become visible to the handler. - */ - smp_wmb(); - - klp_for_each_object(patch, obj) { - if (!klp_is_object_loaded(obj)) - continue; - - ret = klp_pre_patch_callback(obj); - if (ret) { - pr_warn("pre-patch callback failed for object '%s'\n", - klp_is_module(obj) ? obj->name : "vmlinux"); - goto err; - } - - ret = klp_patch_object(obj); - if (ret) { - pr_warn("failed to patch object '%s'\n", - klp_is_module(obj) ? obj->name : "vmlinux"); - goto err; - } - } - - klp_start_transition(); - klp_try_complete_transition(); - patch->enabled = true; - - return 0; -err: - pr_warn("failed to enable patch '%s'\n", patch->mod->name); - - klp_cancel_transition(); - return ret; -} - -/** - * klp_enable_patch() - enables a registered patch - * @patch: The registered, disabled patch to be enabled - * - * Performs the needed symbol lookups and code relocations, - * then registers the patched functions with ftrace. 
- * - * Return: 0 on success, otherwise error - */ -int klp_enable_patch(struct klp_patch *patch) -{ - int ret; - - mutex_lock(&klp_mutex); - - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } - - ret = __klp_enable_patch(patch); - -err: - mutex_unlock(&klp_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(klp_enable_patch); - /* * Sysfs Interface * @@ -449,11 +313,11 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/transition - * /sys/kernel/livepatch/<patch>/signal * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/<object> * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ +static int __klp_disable_patch(struct klp_patch *patch); static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -470,40 +334,32 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, mutex_lock(&klp_mutex); - if (!klp_is_patch_registered(patch)) { - /* - * Module with the patch could either disappear meanwhile or is - * not properly initialized yet. - */ - ret = -EINVAL; - goto err; - } - if (patch->enabled == enabled) { /* already in requested state */ ret = -EINVAL; - goto err; + goto out; } - if (patch == klp_transition_patch) { + /* + * Allow to reverse a pending transition in both ways. It might be + * necessary to complete the transition without forcing and breaking + * the system integrity. + * + * Do not allow to re-enable a disabled patch. + */ + if (patch == klp_transition_patch) klp_reverse_transition(); - } else if (enabled) { - ret = __klp_enable_patch(patch); - if (ret) - goto err; - } else { + else if (!enabled) ret = __klp_disable_patch(patch); - if (ret) - goto err; - } + else + ret = -EINVAL; +out: mutex_unlock(&klp_mutex); + if (ret) + return ret; return count; - -err: - mutex_unlock(&klp_mutex); - return ret; } static ssize_t enabled_show(struct kobject *kobj, @@ -525,35 +381,6 @@ static ssize_t transition_show(struct kobject *kobj, patch == klp_transition_patch); } -static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct klp_patch *patch; - int ret; - bool val; - - ret = kstrtobool(buf, &val); - if (ret) - return ret; - - if (!val) - return count; - - mutex_lock(&klp_mutex); - - patch = container_of(kobj, struct klp_patch, kobj); - if (patch != klp_transition_patch) { - mutex_unlock(&klp_mutex); - return -EINVAL; - } - - klp_send_signals(); - - mutex_unlock(&klp_mutex); - - return count; -} - static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -585,16 +412,129 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); -static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, - &signal_kobj_attr.attr, &force_kobj_attr.attr, NULL }; +static void klp_free_object_dynamic(struct klp_object *obj) +{ + kfree(obj->name); + kfree(obj); +} + +static struct klp_object *klp_alloc_object_dynamic(const char *name) +{ + struct klp_object *obj; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return NULL; + + if 
(name) { + obj->name = kstrdup(name, GFP_KERNEL); + if (!obj->name) { + kfree(obj); + return NULL; + } + } + + INIT_LIST_HEAD(&obj->func_list); + obj->dynamic = true; + + return obj; +} + +static void klp_free_func_nop(struct klp_func *func) +{ + kfree(func->old_name); + kfree(func); +} + +static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, + struct klp_object *obj) +{ + struct klp_func *func; + + func = kzalloc(sizeof(*func), GFP_KERNEL); + if (!func) + return NULL; + + if (old_func->old_name) { + func->old_name = kstrdup(old_func->old_name, GFP_KERNEL); + if (!func->old_name) { + kfree(func); + return NULL; + } + } + + /* + * func->new_func is same as func->old_func. These addresses are + * set when the object is loaded, see klp_init_object_loaded(). + */ + func->old_sympos = old_func->old_sympos; + func->nop = true; + + return func; +} + +static int klp_add_object_nops(struct klp_patch *patch, + struct klp_object *old_obj) +{ + struct klp_object *obj; + struct klp_func *func, *old_func; + + obj = klp_find_object(patch, old_obj); + + if (!obj) { + obj = klp_alloc_object_dynamic(old_obj->name); + if (!obj) + return -ENOMEM; + + list_add_tail(&obj->node, &patch->obj_list); + } + + klp_for_each_func(old_obj, old_func) { + func = klp_find_func(obj, old_func); + if (func) + continue; + + func = klp_alloc_func_nop(old_func, obj); + if (!func) + return -ENOMEM; + + list_add_tail(&func->node, &obj->func_list); + } + + return 0; +} + +/* + * Add 'nop' functions which simply return to the caller to run + * the original function. The 'nop' functions are added to a + * patch to facilitate a 'replace' mode. + */ +static int klp_add_nops(struct klp_patch *patch) +{ + struct klp_patch *old_patch; + struct klp_object *old_obj; + + klp_for_each_patch(old_patch) { + klp_for_each_object(old_patch, old_obj) { + int err; + + err = klp_add_object_nops(patch, old_obj); + if (err) + return err; + } + } + + return 0; +} + static void klp_kobj_release_patch(struct kobject *kobj) { struct klp_patch *patch; @@ -611,6 +551,12 @@ static struct kobj_type klp_ktype_patch = { static void klp_kobj_release_object(struct kobject *kobj) { + struct klp_object *obj; + + obj = container_of(kobj, struct klp_object, kobj); + + if (obj->dynamic) + klp_free_object_dynamic(obj); } static struct kobj_type klp_ktype_object = { @@ -620,6 +566,12 @@ static struct kobj_type klp_ktype_object = { static void klp_kobj_release_func(struct kobject *kobj) { + struct klp_func *func; + + func = container_of(kobj, struct klp_func, kobj); + + if (func->nop) + klp_free_func_nop(func); } static struct kobj_type klp_ktype_func = { @@ -627,17 +579,23 @@ static struct kobj_type klp_ktype_func = { .sysfs_ops = &kobj_sysfs_ops, }; -/* - * Free all functions' kobjects in the array up to some limit. When limit is - * NULL, all kobjects are freed. - */ -static void klp_free_funcs_limited(struct klp_object *obj, - struct klp_func *limit) +static void __klp_free_funcs(struct klp_object *obj, bool nops_only) { - struct klp_func *func; + struct klp_func *func, *tmp_func; + + klp_for_each_func_safe(obj, func, tmp_func) { + if (nops_only && !func->nop) + continue; + + list_del(&func->node); - for (func = obj->funcs; func->old_name && func != limit; func++) - kobject_put(&func->kobj); + /* Might be called from klp_init_patch() error path. 
*/ + if (func->kobj_added) { + kobject_put(&func->kobj); + } else if (func->nop) { + klp_free_func_nop(func); + } + } } /* Clean up when a patched object is unloaded */ @@ -647,35 +605,111 @@ static void klp_free_object_loaded(struct klp_object *obj) obj->mod = NULL; - klp_for_each_func(obj, func) - func->old_addr = 0; + klp_for_each_func(obj, func) { + func->old_func = NULL; + + if (func->nop) + func->new_func = NULL; + } } -/* - * Free all objects' kobjects in the array up to some limit. When limit is - * NULL, all kobjects are freed. - */ -static void klp_free_objects_limited(struct klp_patch *patch, - struct klp_object *limit) +static void __klp_free_objects(struct klp_patch *patch, bool nops_only) { - struct klp_object *obj; + struct klp_object *obj, *tmp_obj; + + klp_for_each_object_safe(patch, obj, tmp_obj) { + __klp_free_funcs(obj, nops_only); + + if (nops_only && !obj->dynamic) + continue; + + list_del(&obj->node); - for (obj = patch->objs; obj->funcs && obj != limit; obj++) { - klp_free_funcs_limited(obj, NULL); - kobject_put(&obj->kobj); + /* Might be called from klp_init_patch() error path. */ + if (obj->kobj_added) { + kobject_put(&obj->kobj); + } else if (obj->dynamic) { + klp_free_object_dynamic(obj); + } } } -static void klp_free_patch(struct klp_patch *patch) +static void klp_free_objects(struct klp_patch *patch) +{ + __klp_free_objects(patch, false); +} + +static void klp_free_objects_dynamic(struct klp_patch *patch) +{ + __klp_free_objects(patch, true); +} + +/* + * This function implements the free operations that can be called safely + * under klp_mutex. + * + * The operation must be completed by calling klp_free_patch_finish() + * outside klp_mutex. + */ +void klp_free_patch_start(struct klp_patch *patch) { - klp_free_objects_limited(patch, NULL); if (!list_empty(&patch->list)) list_del(&patch->list); + + klp_free_objects(patch); +} + +/* + * This function implements the free part that must be called outside + * klp_mutex. + * + * It must be called after klp_free_patch_start(). And it has to be + * the last function accessing the livepatch structures when the patch + * gets disabled. + */ +static void klp_free_patch_finish(struct klp_patch *patch) +{ + /* + * Avoid deadlock with enabled_store() sysfs callback by + * calling this outside klp_mutex. It is safe because + * this is called when the patch gets disabled and it + * cannot get enabled again. + */ + if (patch->kobj_added) { + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + } + + /* Put the module after the last access to struct klp_patch. */ + if (!patch->forced) + module_put(patch->mod); +} + +/* + * The livepatch might be freed from sysfs interface created by the patch. + * This work allows to wait until the interface is destroyed in a separate + * context. + */ +static void klp_free_patch_work_fn(struct work_struct *work) +{ + struct klp_patch *patch = + container_of(work, struct klp_patch, free_work); + + klp_free_patch_finish(patch); } static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - if (!func->old_name || !func->new_func) + int ret; + + if (!func->old_name) + return -EINVAL; + + /* + * NOPs get the address later. The patched module must be loaded, + * see klp_init_object_loaded(). + */ + if (!func->new_func && !func->nop) return -EINVAL; if (strlen(func->old_name) >= KSYM_NAME_LEN) @@ -690,9 +724,13 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) * object. 
If the user selects 0 for old_sympos, then 1 will be used * since a unique symbol will be the first occurrence. */ - return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s,%lu", func->old_name, - func->old_sympos ? func->old_sympos : 1); + ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, + &obj->kobj, "%s,%lu", func->old_name, + func->old_sympos ? func->old_sympos : 1); + if (!ret) + func->kobj_added = true; + + return ret; } /* Arches may override this to finish any remaining arch-specific tasks */ @@ -721,11 +759,11 @@ static int klp_init_object_loaded(struct klp_patch *patch, klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, - &func->old_addr); + (unsigned long *)&func->old_func); if (ret) return ret; - ret = kallsyms_lookup_size_offset(func->old_addr, + ret = kallsyms_lookup_size_offset((unsigned long)func->old_func, &func->old_size, NULL); if (!ret) { pr_err("kallsyms size lookup failed for '%s'\n", @@ -733,6 +771,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, return -ENOENT; } + if (func->nop) + func->new_func = func->old_func; + ret = kallsyms_lookup_size_offset((unsigned long)func->new_func, &func->new_size, NULL); if (!ret) { @@ -751,9 +792,6 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) int ret; const char *name; - if (!obj->funcs) - return -EINVAL; - if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) return -EINVAL; @@ -767,122 +805,191 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) &patch->kobj, "%s", name); if (ret) return ret; + obj->kobj_added = true; klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); if (ret) - goto free; + return ret; } - if (klp_is_object_loaded(obj)) { + if (klp_is_object_loaded(obj)) ret = klp_init_object_loaded(patch, obj); - if (ret) - goto free; - } - return 0; - -free: - klp_free_funcs_limited(obj, func); - kobject_put(&obj->kobj); return ret; } -static int klp_init_patch(struct klp_patch *patch) +static int klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; - int ret; + struct klp_func *func; if (!patch->objs) return -EINVAL; - mutex_lock(&klp_mutex); - + INIT_LIST_HEAD(&patch->list); + INIT_LIST_HEAD(&patch->obj_list); + patch->kobj_added = false; patch->enabled = false; + patch->forced = false; + INIT_WORK(&patch->free_work, klp_free_patch_work_fn); init_completion(&patch->finish); + klp_for_each_object_static(patch, obj) { + if (!obj->funcs) + return -EINVAL; + + INIT_LIST_HEAD(&obj->func_list); + obj->kobj_added = false; + list_add_tail(&obj->node, &patch->obj_list); + + klp_for_each_func_static(obj, func) { + func->kobj_added = false; + list_add_tail(&func->node, &obj->func_list); + } + } + + if (!try_module_get(patch->mod)) + return -ENODEV; + + return 0; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, klp_root_kobj, "%s", patch->mod->name); - if (ret) { - mutex_unlock(&klp_mutex); + if (ret) return ret; + patch->kobj_added = true; + + if (patch->replace) { + ret = klp_add_nops(patch); + if (ret) + return ret; } klp_for_each_object(patch, obj) { ret = klp_init_object(patch, obj); if (ret) - goto free; + return ret; } list_add_tail(&patch->list, &klp_patches); - mutex_unlock(&klp_mutex); - return 0; +} -free: - klp_free_objects_limited(patch, obj); +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object 
*obj; - mutex_unlock(&klp_mutex); + if (WARN_ON(!patch->enabled)) + return -EINVAL; - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); + if (klp_transition_patch) + return -EBUSY; - return ret; + klp_init_transition(patch, KLP_UNPATCHED); + + klp_for_each_object(patch, obj) + if (obj->patched) + klp_pre_unpatch_callback(obj); + + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the TIF_PATCH_PENDING writes in + * klp_start_transition(). In the rare case where klp_ftrace_handler() + * is called shortly after klp_update_patch_state() switches the task, + * this ensures the handler sees that func->transition is set. + */ + smp_wmb(); + + klp_start_transition(); + patch->enabled = false; + klp_try_complete_transition(); + + return 0; } -/** - * klp_unregister_patch() - unregisters a patch - * @patch: Disabled patch to be unregistered - * - * Frees the data structures and removes the sysfs interface. - * - * Return: 0 on success, otherwise error - */ -int klp_unregister_patch(struct klp_patch *patch) +static int __klp_enable_patch(struct klp_patch *patch) { + struct klp_object *obj; int ret; - mutex_lock(&klp_mutex); + if (klp_transition_patch) + return -EBUSY; - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } + if (WARN_ON(patch->enabled)) + return -EINVAL; - if (patch->enabled) { - ret = -EBUSY; - goto err; - } + if (!patch->kobj_added) + return -EINVAL; - klp_free_patch(patch); + pr_notice("enabling patch '%s'\n", patch->mod->name); - mutex_unlock(&klp_mutex); + klp_init_transition(patch, KLP_PATCHED); + + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the ops->func_stack writes in + * klp_patch_object(), so that klp_ftrace_handler() will see the + * func->transition updates before the handler is registered and the + * new funcs become visible to the handler. + */ + smp_wmb(); + + klp_for_each_object(patch, obj) { + if (!klp_is_object_loaded(obj)) + continue; + + ret = klp_pre_patch_callback(obj); + if (ret) { + pr_warn("pre-patch callback failed for object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } + + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to patch object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } + } - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); + klp_start_transition(); + patch->enabled = true; + klp_try_complete_transition(); return 0; err: - mutex_unlock(&klp_mutex); + pr_warn("failed to enable patch '%s'\n", patch->mod->name); + + klp_cancel_transition(); return ret; } -EXPORT_SYMBOL_GPL(klp_unregister_patch); /** - * klp_register_patch() - registers a patch - * @patch: Patch to be registered + * klp_enable_patch() - enable the livepatch + * @patch: patch to be enabled * - * Initializes the data structure associated with the patch and - * creates the sysfs interface. + * Initializes the data structure associated with the patch, creates the sysfs + * interface, performs the needed symbol lookups and code relocations, + * registers the patched functions with ftrace. * - * There is no need to take the reference on the patch module here. It is done - * later when the patch is enabled. + * This function is supposed to be called from the livepatch module_init() + * callback. 
* * Return: 0 on success, otherwise error */ -int klp_register_patch(struct klp_patch *patch) +int klp_enable_patch(struct klp_patch *patch) { + int ret; + if (!patch || !patch->mod) return -EINVAL; @@ -897,12 +1004,91 @@ int klp_register_patch(struct klp_patch *patch) if (!klp_have_reliable_stack()) { pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); - return -ENOSYS; + return -EOPNOTSUPP; + } + + + mutex_lock(&klp_mutex); + + ret = klp_init_patch_early(patch); + if (ret) { + mutex_unlock(&klp_mutex); + return ret; } - return klp_init_patch(patch); + ret = klp_init_patch(patch); + if (ret) + goto err; + + ret = __klp_enable_patch(patch); + if (ret) + goto err; + + mutex_unlock(&klp_mutex); + + return 0; + +err: + klp_free_patch_start(patch); + + mutex_unlock(&klp_mutex); + + klp_free_patch_finish(patch); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * This function removes replaced patches. + * + * We could be pretty aggressive here. It is called in the situation where + * these structures are no longer accessible. All functions are redirected + * by the klp_transition_patch. They use either a new code or they are in + * the original code because of the special nop function patches. + * + * The only exception is when the transition was forced. In this case, + * klp_ftrace_handler() might still see the replaced patch on the stack. + * Fortunately, it is carefully designed to work with removed functions + * thanks to RCU. We only have to keep the patches on the system. Also + * this is handled transparently by patch->module_put. + */ +void klp_discard_replaced_patches(struct klp_patch *new_patch) +{ + struct klp_patch *old_patch, *tmp_patch; + + klp_for_each_patch_safe(old_patch, tmp_patch) { + if (old_patch == new_patch) + return; + + old_patch->enabled = false; + klp_unpatch_objects(old_patch); + klp_free_patch_start(old_patch); + schedule_work(&old_patch->free_work); + } +} + +/* + * This function removes the dynamically allocated 'nop' functions. + * + * We could be pretty aggressive. NOPs do not change the existing + * behavior except for adding unnecessary delay by the ftrace handler. + * + * It is safe even when the transition was forced. The ftrace handler + * will see a valid ops->func_stack entry thanks to RCU. + * + * We could even free the NOPs structures. They must be the last entry + * in ops->func_stack. Therefore unregister_ftrace_function() is called. + * It does the same as klp_synchronize_transition() to make sure that + * nobody is inside the ftrace handler once the operation finishes. + * + * IMPORTANT: It must be called right after removing the replaced patches! + */ +void klp_discard_nops(struct klp_patch *new_patch) +{ + klp_unpatch_objects_dynamic(klp_transition_patch); + klp_free_objects_dynamic(klp_transition_patch); } -EXPORT_SYMBOL_GPL(klp_register_patch); /* * Remove parts of patches that touch a given kernel module. The list of @@ -915,7 +1101,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod, struct klp_patch *patch; struct klp_object *obj; - list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_patch(patch) { if (patch == limit) break; @@ -923,21 +1109,14 @@ static void klp_cleanup_module_patches_limited(struct module *mod, if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - /* - * Only unpatch the module if the patch is enabled or - * is in transition. 
- */ - if (patch->enabled || patch == klp_transition_patch) { + if (patch != klp_transition_patch) + klp_pre_unpatch_callback(obj); - if (patch != klp_transition_patch) - klp_pre_unpatch_callback(obj); + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_unpatch_object(obj); - pr_notice("reverting patch '%s' on unloading module '%s'\n", - patch->mod->name, obj->mod->name); - klp_unpatch_object(obj); - - klp_post_unpatch_callback(obj); - } + klp_post_unpatch_callback(obj); klp_free_object_loaded(obj); break; @@ -962,7 +1141,7 @@ int klp_module_coming(struct module *mod) */ mod->klp_alive = true; - list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_patch(patch) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; @@ -976,13 +1155,6 @@ int klp_module_coming(struct module *mod) goto err; } - /* - * Only patch the module if the patch is enabled or is - * in transition. - */ - if (!patch->enabled && patch != klp_transition_patch) - break; - pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index 48a83d4364cf..ec43a40b853f 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -5,6 +5,17 @@ #include <linux/livepatch.h> extern struct mutex klp_mutex; +extern struct list_head klp_patches; + +#define klp_for_each_patch_safe(patch, tmp_patch) \ + list_for_each_entry_safe(patch, tmp_patch, &klp_patches, list) + +#define klp_for_each_patch(patch) \ + list_for_each_entry(patch, &klp_patches, list) + +void klp_free_patch_start(struct klp_patch *patch); +void klp_discard_replaced_patches(struct klp_patch *new_patch); +void klp_discard_nops(struct klp_patch *new_patch); static inline bool klp_is_object_loaded(struct klp_object *obj) { diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 7702cb4064fc..99cb3ad05eb4 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -34,7 +34,7 @@ static LIST_HEAD(klp_ops); -struct klp_ops *klp_find_ops(unsigned long old_addr) +struct klp_ops *klp_find_ops(void *old_func) { struct klp_ops *ops; struct klp_func *func; @@ -42,7 +42,7 @@ struct klp_ops *klp_find_ops(unsigned long old_addr) list_for_each_entry(ops, &klp_ops, node) { func = list_first_entry(&ops->func_stack, struct klp_func, stack_node); - if (func->old_addr == old_addr) + if (func->old_func == old_func) return ops; } @@ -118,7 +118,15 @@ static void notrace klp_ftrace_handler(unsigned long ip, } } + /* + * NOPs are used to replace existing patches with original code. + * Do nothing! Setting pc would cause an infinite loop. 
+ */ + if (func->nop) + goto unlock; + klp_arch_set_pc(regs, (unsigned long)func->new_func); + unlock: preempt_enable_notrace(); } @@ -142,17 +150,18 @@ static void klp_unpatch_func(struct klp_func *func) if (WARN_ON(!func->patched)) return; - if (WARN_ON(!func->old_addr)) + if (WARN_ON(!func->old_func)) return; - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (WARN_ON(!ops)) return; if (list_is_singular(&ops->func_stack)) { unsigned long ftrace_loc; - ftrace_loc = klp_get_ftrace_location(func->old_addr); + ftrace_loc = + klp_get_ftrace_location((unsigned long)func->old_func); if (WARN_ON(!ftrace_loc)) return; @@ -174,17 +183,18 @@ static int klp_patch_func(struct klp_func *func) struct klp_ops *ops; int ret; - if (WARN_ON(!func->old_addr)) + if (WARN_ON(!func->old_func)) return -EINVAL; if (WARN_ON(func->patched)) return -EINVAL; - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (!ops) { unsigned long ftrace_loc; - ftrace_loc = klp_get_ftrace_location(func->old_addr); + ftrace_loc = + klp_get_ftrace_location((unsigned long)func->old_func); if (!ftrace_loc) { pr_err("failed to find location for function '%s'\n", func->old_name); @@ -236,15 +246,26 @@ err: return ret; } -void klp_unpatch_object(struct klp_object *obj) +static void __klp_unpatch_object(struct klp_object *obj, bool nops_only) { struct klp_func *func; - klp_for_each_func(obj, func) + klp_for_each_func(obj, func) { + if (nops_only && !func->nop) + continue; + if (func->patched) klp_unpatch_func(func); + } - obj->patched = false; + if (obj->dynamic || !nops_only) + obj->patched = false; +} + + +void klp_unpatch_object(struct klp_object *obj) +{ + __klp_unpatch_object(obj, false); } int klp_patch_object(struct klp_object *obj) @@ -267,11 +288,21 @@ int klp_patch_object(struct klp_object *obj) return 0; } -void klp_unpatch_objects(struct klp_patch *patch) +static void __klp_unpatch_objects(struct klp_patch *patch, bool nops_only) { struct klp_object *obj; klp_for_each_object(patch, obj) if (obj->patched) - klp_unpatch_object(obj); + __klp_unpatch_object(obj, nops_only); +} + +void klp_unpatch_objects(struct klp_patch *patch) +{ + __klp_unpatch_objects(patch, false); +} + +void klp_unpatch_objects_dynamic(struct klp_patch *patch) +{ + __klp_unpatch_objects(patch, true); } diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h index e72d8250d04b..d5f2fbe373e0 100644 --- a/kernel/livepatch/patch.h +++ b/kernel/livepatch/patch.h @@ -10,7 +10,7 @@ * struct klp_ops - structure for tracking registered ftrace ops structs * * A single ftrace_ops is shared between all enabled replacement functions - * (klp_func structs) which have the same old_addr. This allows the switch + * (klp_func structs) which have the same old_func. This allows the switch * between function versions to happen instantaneously by updating the klp_ops * struct's func_stack list. The winner is the klp_func at the top of the * func_stack (front of the list). 
@@ -25,10 +25,11 @@ struct klp_ops { struct ftrace_ops fops; }; -struct klp_ops *klp_find_ops(unsigned long old_addr); +struct klp_ops *klp_find_ops(void *old_func); int klp_patch_object(struct klp_object *obj); void klp_unpatch_object(struct klp_object *obj); void klp_unpatch_objects(struct klp_patch *patch); +void klp_unpatch_objects_dynamic(struct klp_patch *patch); #endif /* _LIVEPATCH_PATCH_H */ diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 304d5eb8a98c..9c89ae8b337a 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -29,11 +29,13 @@ #define MAX_STACK_ENTRIES 100 #define STACK_ERR_BUF_SIZE 128 +#define SIGNALS_TIMEOUT 15 + struct klp_patch *klp_transition_patch; static int klp_target_state = KLP_UNDEFINED; -static bool klp_forced = false; +static unsigned int klp_signals_cnt; /* * This work can be performed periodically to finish patching or unpatching any @@ -87,6 +89,11 @@ static void klp_complete_transition(void) klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { + klp_discard_replaced_patches(klp_transition_patch); + klp_discard_nops(klp_transition_patch); + } + if (klp_target_state == KLP_UNPATCHED) { /* * All tasks have transitioned to KLP_UNPATCHED so we can now @@ -136,13 +143,6 @@ static void klp_complete_transition(void) pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* - * klp_forced set implies unbounded increase of module's ref count if - * the module is disabled/enabled in a loop. - */ - if (!klp_forced && klp_target_state == KLP_UNPATCHED) - module_put(klp_transition_patch->mod); - klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; } @@ -224,11 +224,11 @@ static int klp_check_stack_func(struct klp_func *func, * Check for the to-be-patched function * (the previous func). */ - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (list_is_singular(&ops->func_stack)) { /* original function */ - func_addr = func->old_addr; + func_addr = (unsigned long)func->old_func; func_size = func->old_size; } else { /* previously patched function */ @@ -348,6 +348,47 @@ done: } /* + * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. + * Kthreads with TIF_PATCH_PENDING set are woken up. + */ +static void klp_send_signals(void) +{ + struct task_struct *g, *task; + + if (klp_signals_cnt == SIGNALS_TIMEOUT) + pr_notice("signaling remaining tasks\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + if (!klp_patch_pending(task)) + continue; + + /* + * There is a small race here. We could see TIF_PATCH_PENDING + * set and decide to wake up a kthread or send a fake signal. + * Meanwhile the task could migrate itself and the action + * would be meaningless. It is not serious though. + */ + if (task->flags & PF_KTHREAD) { + /* + * Wake up a kthread which sleeps interruptedly and + * still has not been migrated. + */ + wake_up_state(task, TASK_INTERRUPTIBLE); + } else { + /* + * Send fake signal to all non-kthread tasks which are + * still not migrated. 
+ */ + spin_lock_irq(&task->sighand->siglock); + signal_wake_up(task, 0); + spin_unlock_irq(&task->sighand->siglock); + } + } + read_unlock(&tasklist_lock); +} + +/* * Try to switch all remaining tasks to the target patch state by walking the * stacks of sleeping tasks and looking for any to-be-patched or * to-be-unpatched functions. If such functions are found, the task can't be @@ -359,6 +400,7 @@ void klp_try_complete_transition(void) { unsigned int cpu; struct task_struct *g, *task; + struct klp_patch *patch; bool complete = true; WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); @@ -396,6 +438,10 @@ void klp_try_complete_transition(void) put_online_cpus(); if (!complete) { + if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT)) + klp_send_signals(); + klp_signals_cnt++; + /* * Some tasks weren't able to be switched over. Try again * later and/or wait for other methods like kernel exit @@ -407,7 +453,18 @@ void klp_try_complete_transition(void) } /* we're done, now cleanup the data structures */ + patch = klp_transition_patch; klp_complete_transition(); + + /* + * It would make more sense to free the patch in + * klp_complete_transition() but it is called also + * from klp_cancel_transition(). + */ + if (!patch->enabled) { + klp_free_patch_start(patch); + schedule_work(&patch->free_work); + } } /* @@ -446,6 +503,8 @@ void klp_start_transition(void) if (task->patch_state != klp_target_state) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } + + klp_signals_cnt = 0; } /* @@ -569,47 +628,6 @@ void klp_copy_process(struct task_struct *child) } /* - * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. - * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this - * action currently. - */ -void klp_send_signals(void) -{ - struct task_struct *g, *task; - - pr_notice("signaling remaining tasks\n"); - - read_lock(&tasklist_lock); - for_each_process_thread(g, task) { - if (!klp_patch_pending(task)) - continue; - - /* - * There is a small race here. We could see TIF_PATCH_PENDING - * set and decide to wake up a kthread or send a fake signal. - * Meanwhile the task could migrate itself and the action - * would be meaningless. It is not serious though. - */ - if (task->flags & PF_KTHREAD) { - /* - * Wake up a kthread which sleeps interruptedly and - * still has not been migrated. - */ - wake_up_state(task, TASK_INTERRUPTIBLE); - } else { - /* - * Send fake signal to all non-kthread tasks which are - * still not migrated. - */ - spin_lock_irq(&task->sighand->siglock); - signal_wake_up(task, 0); - spin_unlock_irq(&task->sighand->siglock); - } - } - read_unlock(&tasklist_lock); -} - -/* * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an * existing transition to finish. 
* @@ -620,6 +638,7 @@ void klp_send_signals(void) */ void klp_force_transition(void) { + struct klp_patch *patch; struct task_struct *g, *task; unsigned int cpu; @@ -633,5 +652,6 @@ void klp_force_transition(void) for_each_possible_cpu(cpu) klp_update_patch_state(idle_task(cpu)); - klp_forced = true; + klp_for_each_patch(patch) + patch->forced = true; } diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index f9d0bc016067..322db16233de 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -11,7 +11,6 @@ void klp_cancel_transition(void); void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); -void klp_send_signals(void); void klp_force_transition(void); #endif /* _LIVEPATCH_TRANSITION_H */ diff --git a/kernel/module.c b/kernel/module.c index 2ad1b5239910..0b9aa8ab89f0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2719,11 +2719,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig { if (!debug) return; -#ifdef CONFIG_DYNAMIC_DEBUG - if (ddebug_add_module(debug, num, mod->name)) - pr_err("dynamic debug error adding module: %s\n", - debug->modname); -#endif + ddebug_add_module(debug, num, mod->name); } static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) diff --git a/kernel/panic.c b/kernel/panic.c index f121e6ba7e11..0ae0d7332f12 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -642,16 +642,14 @@ static int clear_warn_once_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, - NULL, - clear_warn_once_set, - "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set, + "%lld\n"); static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0200, NULL, - NULL, &clear_warn_once_fops); + debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL, + &clear_warn_once_fops); return 0; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d9dc2c38764a..7d66ee68aaaf 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -10,6 +10,7 @@ #include <linux/cpu.h> #include <linux/cpumask.h> +#include <linux/debugfs.h> #include <linux/energy_model.h> #include <linux/sched/topology.h> #include <linux/slab.h> @@ -23,6 +24,60 @@ static DEFINE_PER_CPU(struct em_perf_domain *, em_data); */ static DEFINE_MUTEX(em_pd_mutex); +#ifdef CONFIG_DEBUG_FS +static struct dentry *rootdir; + +static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +{ + struct dentry *d; + char name[24]; + + snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + + /* Create per-cs directory */ + d = debugfs_create_dir(name, pd); + debugfs_create_ulong("frequency", 0444, d, &cs->frequency); + debugfs_create_ulong("power", 0444, d, &cs->power); + debugfs_create_ulong("cost", 0444, d, &cs->cost); +} + +static int em_debug_cpus_show(struct seq_file *s, void *unused) +{ + seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private))); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); + +static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +{ + struct dentry *d; + char name[8]; + int i; + + snprintf(name, sizeof(name), "pd%d", cpu); + + /* Create the directory of the performance domain */ + d = debugfs_create_dir(name, rootdir); + + debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + + /* Create a sub-directory for each capacity state */ + for (i = 0; i 
< pd->nr_cap_states; i++) + em_debug_create_cs(&pd->table[i], d); +} + +static int __init em_debug_init(void) +{ + /* Create /sys/kernel/debug/energy_model directory */ + rootdir = debugfs_create_dir("energy_model", NULL); + + return 0; +} +core_initcall(em_debug_init); +#else /* CONFIG_DEBUG_FS */ +static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} +#endif static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, struct em_data_callback *cb) { @@ -102,6 +157,8 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, pd->nr_cap_states = nr_states; cpumask_copy(to_cpumask(pd->cpus), span); + em_debug_create_pd(pd, cpu); + return pd; free_cs_table: diff --git a/kernel/power/qos.c b/kernel/power/qos.c index b7a82502857a..9d22131afc1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -582,10 +582,8 @@ static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) qos->pm_qos_power_miscdev.name = qos->name; qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - if (d) { - (void)debugfs_create_file(qos->name, S_IRUGO, d, - (void *)qos, &pm_qos_debug_fops); - } + debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos, + &pm_qos_debug_fops); return misc_register(&qos->pm_qos_power_miscdev); } @@ -685,8 +683,6 @@ static int __init pm_qos_power_init(void) BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); d = debugfs_create_dir("pm_qos", NULL); - if (IS_ERR_OR_NULL(d)) - d = NULL; for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { ret = register_pm_qos_misc(pm_qos_array[i], d); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..4802b039b89f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); @@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d3d170374ceb..8eee85bb2687 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -344,7 +344,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; enum log_flags { LOG_NEWLINE = 2, /* text ended with a newline */ - LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -356,6 +355,9 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ +#ifdef CONFIG_PRINTK_CALLER + u32 caller_id; /* thread id 
or processor id */ +#endif } #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS __packed __aligned(4) @@ -422,7 +424,11 @@ static u64 exclusive_console_stop_seq; static u64 clear_seq; static u32 clear_idx; +#ifdef CONFIG_PRINTK_CALLER +#define PREFIX_MAX 48 +#else #define PREFIX_MAX 32 +#endif #define LOG_LINE_MAX (1024 - PREFIX_MAX) #define LOG_LEVEL(v) ((v) & 0x07) @@ -577,7 +583,7 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, } /* insert record into the buffer, discard old ones, update heads */ -static int log_store(int facility, int level, +static int log_store(u32 caller_id, int facility, int level, enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) @@ -625,6 +631,9 @@ static int log_store(int facility, int level, msg->ts_nsec = ts_nsec; else msg->ts_nsec = local_clock(); +#ifdef CONFIG_PRINTK_CALLER + msg->caller_id = caller_id; +#endif memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = size; @@ -688,12 +697,21 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, struct printk_log *msg, u64 seq) { u64 ts_usec = msg->ts_nsec; + char caller[20]; +#ifdef CONFIG_PRINTK_CALLER + u32 id = msg->caller_id; + + snprintf(caller, sizeof(caller), ",caller=%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); +#else + caller[0] = '\0'; +#endif do_div(ts_usec, 1000); - return scnprintf(buf, size, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-'); + return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", + (msg->facility << 3) | msg->level, seq, ts_usec, + msg->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -1038,6 +1056,9 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_OFFSET(printk_log, len); VMCOREINFO_OFFSET(printk_log, text_len); VMCOREINFO_OFFSET(printk_log, dict_len); +#ifdef CONFIG_PRINTK_CALLER + VMCOREINFO_OFFSET(printk_log, caller_id); +#endif } #endif @@ -1236,10 +1257,23 @@ static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); - return sprintf(buf, "[%5lu.%06lu] ", + return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } +#ifdef CONFIG_PRINTK_CALLER +static size_t print_caller(u32 id, char *buf) +{ + char caller[12]; + + snprintf(caller, sizeof(caller), "%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); + return sprintf(buf, "[%6s]", caller); +} +#else +#define print_caller(id, buf) 0 +#endif + static size_t print_prefix(const struct printk_log *msg, bool syslog, bool time, char *buf) { @@ -1247,8 +1281,17 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, if (syslog) len = print_syslog((msg->facility << 3) | msg->level, buf); + if (time) len += print_time(msg->ts_nsec, buf + len); + + len += print_caller(msg->caller_id, buf + len); + + if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { + buf[len++] = ' '; + buf[len] = '\0'; + } + return len; } @@ -1752,6 +1795,12 @@ static inline void printk_delay(void) } } +static inline u32 printk_caller_id(void) +{ + return in_task() ? task_pid_nr(current) : + 0x80000000 + raw_smp_processor_id(); +} + /* * Continuation lines are buffered, and not committed to the record buffer * until the line is complete, or a race forces it. 
The line fragments @@ -1761,7 +1810,7 @@ static inline void printk_delay(void) static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ - struct task_struct *owner; /* task of first print*/ + u32 caller_id; /* printk_caller_id() of first print */ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ @@ -1773,12 +1822,13 @@ static void cont_flush(void) if (cont.len == 0) return; - log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); + log_store(cont.caller_id, cont.facility, cont.level, cont.flags, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); cont.len = 0; } -static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) +static bool cont_add(u32 caller_id, int facility, int level, + enum log_flags flags, const char *text, size_t len) { /* If the line gets too long, split it up in separate records. */ if (cont.len + len > sizeof(cont.buf)) { @@ -1789,7 +1839,7 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * if (!cont.len) { cont.facility = facility; cont.level = level; - cont.owner = current; + cont.caller_id = caller_id; cont.ts_nsec = local_clock(); cont.flags = flags; } @@ -1809,13 +1859,15 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { + const u32 caller_id = printk_caller_id(); + /* * If an earlier line was buffered, and we're a continuation - * write from the same process, try to add it to the buffer. + * write from the same context, try to add it to the buffer. */ if (cont.len) { - if (cont.owner == current && (lflags & LOG_CONT)) { - if (cont_add(facility, level, lflags, text, text_len)) + if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { + if (cont_add(caller_id, facility, level, lflags, text, text_len)) return text_len; } /* Otherwise, make sure it's flushed */ @@ -1828,12 +1880,13 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { - if (cont_add(facility, level, lflags, text, text_len)) + if (cont_add(caller_id, facility, level, lflags, text, text_len)) return text_len; } /* Store it in the record log */ - return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); + return log_store(caller_id, facility, level, lflags, 0, + dict, dictlen, text, text_len); } /* Must be called under logbuf_lock. */ @@ -1867,9 +1920,6 @@ int vprintk_store(int facility, int level, case '0' ... 
'7': if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; - /* fallthrough */ - case 'd': /* KERN_DEFAULT */ - lflags |= LOG_PREFIX; break; case 'c': /* KERN_CONT */ lflags |= LOG_CONT; @@ -1884,7 +1934,7 @@ int vprintk_store(int facility, int level, level = default_message_loglevel; if (dict) - lflags |= LOG_PREFIX|LOG_NEWLINE; + lflags |= LOG_NEWLINE; return log_output(facility, level, lflags, dict, dictlen, text, text_len); diff --git a/kernel/resource.c b/kernel/resource.c index 915c02e8e5dd..e81b17b53fa5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -448,8 +448,6 @@ int walk_mem_res(u64 start, u64 end, void *arg, arg, func); } -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) - /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. @@ -481,8 +479,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, return ret; } -#endif - static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3901b84d217..ead464a0f2e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2220,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8213ff6e365d..ea74d43924b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1173,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; + p->numa_preferred_nid = NUMA_NO_NODE; return; } @@ -1193,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1413,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * two full passes of the "multi-stage node selection" test that is * executed below. 
*/ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1861,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -2108,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2651,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) * the preferred node. */ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index a43c601ac252..54a0347ca812 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -445,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. */ if (!task_no_new_privs(current) && - security_capable_noaudit(current_cred(), current_user_ns(), - CAP_SYS_ADMIN) != 0) + security_capable(current_cred(), current_user_ns(), + CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ diff --git a/kernel/sys.c b/kernel/sys.c index f7eb62eceb24..12df0e5434b8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -516,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && - !ns_capable(old->user_ns, CAP_SETUID)) + !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } @@ -525,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && - !ns_capable(old->user_ns, CAP_SETUID)) + !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } @@ -584,7 +584,7 @@ long __sys_setuid(uid_t uid) old = current_cred(); retval = -EPERM; - if (ns_capable(old->user_ns, CAP_SETUID)) { + if (ns_capable_setid(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); @@ -646,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) old = current_cred(); retval = -EPERM; - if (!ns_capable(old->user_ns, CAP_SETUID)) { + if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; @@ -814,7 +814,7 @@ long __sys_setfsuid(uid_t uid) if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || - ns_capable(old->user_ns, CAP_SETUID)) { + ns_capable_setid(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; + /* fall through */ case RUSAGE_SELF: thread_group_cputime_adjusted(p, 
&tgutime, &tgstime); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 85e5ccec0955..51d7c6794bf1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -48,6 +48,9 @@ COND_SYSCALL(io_pgetevents_time32); COND_SYSCALL(io_pgetevents); COND_SYSCALL_COMPAT(io_pgetevents_time32); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL(io_uring_setup); +COND_SYSCALL(io_uring_enter); +COND_SYSCALL(io_uring_register); /* fs/xattr.c */ @@ -202,6 +205,7 @@ COND_SYSCALL(msgget); COND_SYSCALL(old_msgctl); COND_SYSCALL(msgctl); COND_SYSCALL_COMPAT(msgctl); +COND_SYSCALL_COMPAT(old_msgctl); COND_SYSCALL(msgrcv); COND_SYSCALL_COMPAT(msgrcv); COND_SYSCALL(msgsnd); @@ -212,6 +216,7 @@ COND_SYSCALL(semget); COND_SYSCALL(old_semctl); COND_SYSCALL(semctl); COND_SYSCALL_COMPAT(semctl); +COND_SYSCALL_COMPAT(old_semctl); COND_SYSCALL(semtimedop); COND_SYSCALL(semtimedop_time32); COND_SYSCALL(semop); @@ -221,6 +226,7 @@ COND_SYSCALL(shmget); COND_SYSCALL(old_shmctl); COND_SYSCALL(shmctl); COND_SYSCALL_COMPAT(shmctl); +COND_SYSCALL_COMPAT(old_shmctl); COND_SYSCALL(shmat); COND_SYSCALL_COMPAT(shmat); COND_SYSCALL(shmdt); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7c2b9bc88ee8..3fb1405f3f8c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,8 @@ #include <linux/bpf.h> #include <linux/mount.h> +#include "../lib/kstrtox.h" + #include <linux/uaccess.h> #include <asm/processor.h> @@ -127,6 +129,7 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; +static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int one_thousand = 1000; #ifdef CONFIG_PRINTK @@ -1471,7 +1474,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = sysctl_extfrag_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, @@ -1747,6 +1750,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero, + .extra2 = &long_max, }, { .procname = "nr_open", @@ -2117,6 +2122,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v) } } +/** + * strtoul_lenient - parse an ASCII formatted integer from a buffer and only + * fail on overflow + * + * @cp: kernel buffer containing the string to parse + * @endp: pointer to store the trailing characters + * @base: the base to use + * @res: where the parsed integer will be stored + * + * In case of success 0 is returned and @res will contain the parsed integer, + * @endp will hold any trailing characters. + * This function will fail the parse on overflow. If there wasn't an overflow + * the function will defer the decision what characters count as invalid to the + * caller. 
+ */ +static int strtoul_lenient(const char *cp, char **endp, unsigned int base, + unsigned long *res) +{ + unsigned long long result; + unsigned int rv; + + cp = _parse_integer_fixup_radix(cp, &base); + rv = _parse_integer(cp, base, &result); + if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) + return -ERANGE; + + cp += rv; + + if (endp) + *endp = (char *)cp; + + *res = (unsigned long)result; + return 0; +} + #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formatted integer from a user buffer @@ -2160,7 +2200,8 @@ static int proc_get_long(char **buf, size_t *size, if (!isdigit(*p)) return -EINVAL; - *val = simple_strtoul(p, &p, 0); + if (strtoul_lenient(p, &p, 0, val)) + return -EINVAL; len = p - tmp; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27821480105e..217ef481fbbb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1301,7 +1301,7 @@ static int parse_pred(const char *str, void *data, /* go past the last quote */ i++; - } else if (isdigit(str[i])) { + } else if (isdigit(str[i]) || str[i] == '-') { /* Make sure the field is not a string */ if (is_string_field(field)) { @@ -1314,6 +1314,9 @@ static int parse_pred(const char *str, void *data, goto err_free; } + if (str[i] == '-') + i++; + /* We allow 0xDEADBEEF */ while (isalnum(str[i])) i++; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9eaf07f99212..99592c27465e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -865,7 +865,7 @@ fetch_store_strlen(unsigned long addr) u8 c; do { - ret = probe_mem_read(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 977918d5d350..8fbfda94a67b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -199,6 +199,13 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); +static int __init watchdog_thresh_setup(char *str) +{ + get_option(&str, &watchdog_thresh); + return 1; +} +__setup("watchdog_thresh=", watchdog_thresh_setup); + #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e14da4f599ab..4026d1871407 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -648,7 +648,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible - * reordering can lead to a missed execution on attempt to qeueue + * reordering can lead to a missed execution on attempt to queue * the same @work. E.g. consider this case: * * CPU#0 CPU#1 @@ -920,6 +920,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * CONTEXT: * spin_lock_irq(rq->lock) * + * This function is called during schedule() when a kworker is going + * to sleep. It's used by psi to identify aggregation workers during + * dequeuing, to allow periodic aggregation to shut-off when that + * worker is the last task in the system or cgroup to go to sleep. 
+ * + * As this function doesn't involve any workqueue-related locking, it + * only returns stable values when called from inside the scheduler's + * queuing and dequeuing paths, when @task, which must be a kworker, + * is guaranteed to not be processing any works. + * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. @@ -1343,7 +1353,7 @@ static bool is_chained_work(struct workqueue_struct *wq) worker = current_wq_worker(); /* - * Return %true iff I'm a worker execuing a work item on @wq. If + * Return %true iff I'm a worker executing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. */ return worker && worker->current_pwq->wq == wq; @@ -1514,6 +1524,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); +/** + * workqueue_select_cpu_near - Select a CPU based on NUMA node + * @node: NUMA node ID that we want to select a CPU from + * + * This function will attempt to find a "random" cpu available on a given + * node. If there are no CPUs available on the given node it will return + * WORK_CPU_UNBOUND indicating that we should just schedule to any + * available CPU if we need to schedule this work. + */ +static int workqueue_select_cpu_near(int node) +{ + int cpu; + + /* No point in doing this if NUMA isn't enabled for workqueues */ + if (!wq_numa_enabled) + return WORK_CPU_UNBOUND; + + /* Delay binding to CPU if node is not valid or online */ + if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) + return WORK_CPU_UNBOUND; + + /* Use local node/cpu if we are already there */ + cpu = raw_smp_processor_id(); + if (node == cpu_to_node(cpu)) + return cpu; + + /* Use "random" otherwise know as "first" online CPU of node */ + cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); + + /* If CPU is valid return that, otherwise just defer */ + return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; +} + +/** + * queue_work_node - queue work on a "random" cpu for a given NUMA node + * @node: NUMA node that we are targeting the work for + * @wq: workqueue to use + * @work: work to queue + * + * We queue the work to a "random" CPU within a given NUMA node. The basic + * idea here is to provide a way to somehow associate work with a given + * NUMA node. + * + * This function will only make a best effort attempt at getting this onto + * the right NUMA node. If no node is requested or the requested node is + * offline then we just fall back to standard queue_work behavior. + * + * Currently the "random" CPU ends up being the first available CPU in the + * intersection of cpu_online_mask and the cpumask of the node, unless we + * are running on the node. In that case we just use the current CPU. + * + * Return: %false if @work was already on a queue, %true otherwise. + */ +bool queue_work_node(int node, struct workqueue_struct *wq, + struct work_struct *work) +{ + unsigned long flags; + bool ret = false; + + /* + * This current implementation is specific to unbound workqueues. + * Specifically we only return the first available CPU for a given + * node instead of cycling through individual CPUs within the node. + * + * If this is used with a per-cpu workqueue then the logic in + * workqueue_select_cpu_near would need to be updated to allow for + * some round robin type logic. 
+ */ + WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); + + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + int cpu = workqueue_select_cpu_near(node); + + __queue_work(cpu, wq, work); + ret = true; + } + + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_node); + void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = from_timer(dwork, t, timer); @@ -1641,7 +1735,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu) * * Return: %false if @rwork was already pending, %true otherwise. Note * that a full RCU grace period is guaranteed only after a %true return. - * While @rwork is guarnateed to be executed after a %false return, the + * While @rwork is guaranteed to be executed after a %false return, the * execution may happen before a full RCU grace period has passed. */ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) @@ -2933,6 +3027,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!wq_online)) return false; + if (WARN_ON(!work->func)) + return false; + if (!from_cancel) { lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); |
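As a rough illustration of how the queue_work_node() interface added above might be used by a caller, the sketch below queues a work item near a device's NUMA node. The my_example_* names and the struct device pointer are hypothetical; queue_work_node() requires an unbound workqueue, so system_unbound_wq is used here, and an invalid or offline node simply falls back to the usual queue_work() placement.

#include <linux/device.h>
#include <linux/workqueue.h>

/* Hypothetical handler; runs on a CPU near the requested node (best effort). */
static void my_example_work_fn(struct work_struct *work)
{
	/* ... do the deferred work ... */
}

static DECLARE_WORK(my_example_work, my_example_work_fn);

/* Queue the work close to the memory/CPUs of a (hypothetical) device. */
static void my_example_kick(struct device *dev)
{
	int node = dev_to_node(dev);	/* may be NUMA_NO_NODE */

	/* Falls back to WORK_CPU_UNBOUND if the node is invalid or offline. */
	queue_work_node(node, system_unbound_wq, &my_example_work);
}

If a dedicated queue is wanted instead of system_unbound_wq, it still has to be allocated with WQ_UNBOUND (for example via alloc_workqueue("my_example_wq", WQ_UNBOUND, 0)), since queue_work_node() warns when handed a per-cpu workqueue.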