summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-06-28 20:39:26 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-06-28 20:39:26 -0700
commitc54b245d011855ea91c5beff07f1db74143ce614 (patch)
tree7982f28c76f578997f342c7255e222dabeffde63 /kernel
parente17c120f48f7d86ed9fd6e44e9436d32997fd9ec (diff)
parent5e6b8a50a7cec5686ee2c4bda1d49899c79a7eae (diff)
downloadlwn-c54b245d011855ea91c5beff07f1db74143ce614.tar.gz
lwn-c54b245d011855ea91c5beff07f1db74143ce614.zip
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull user namespace rlimit handling update from Eric Biederman: "This is the work mainly by Alexey Gladkov to limit rlimits to the rlimits of the user that created a user namespace, and to allow users to have stricter limits on the resources created within a user namespace." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: cred: add missing return error code when set_cred_ucounts() failed ucounts: Silence warning in dec_rlimit_ucounts ucounts: Set ucount_max to the largest positive value the type can hold kselftests: Add test to check for rlimit changes in different user namespaces Reimplement RLIMIT_MEMLOCK on top of ucounts Reimplement RLIMIT_SIGPENDING on top of ucounts Reimplement RLIMIT_MSGQUEUE on top of ucounts Reimplement RLIMIT_NPROC on top of ucounts Use atomic_t for ucounts reference counting Add a reference to ucounts for each cred Increase size of ucounts to atomic_long_t
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cred.c51
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c18
-rw-r--r--kernel/signal.c25
-rw-r--r--kernel/sys.c14
-rw-r--r--kernel/ucount.c116
-rw-r--r--kernel/user.c3
-rw-r--r--kernel/user_namespace.c9
8 files changed, 186 insertions, 52 deletions
diff --git a/kernel/cred.c b/kernel/cred.c
index e1d274cd741b..e6fd2b3fc31f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -60,6 +60,7 @@ struct cred init_cred = {
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
+ .ucounts = &init_ucounts,
};
static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ if (cred->ucounts)
+ put_ucounts(cred->ucounts);
put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void)
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
+ new->ucounts = get_ucounts(&init_ucounts);
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -284,6 +288,11 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
+
+ new->ucounts = get_ucounts(new->ucounts);
+ if (!new->ucounts)
+ goto error;
+
validate_creds(new);
return new;
@@ -351,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
- atomic_inc(&p->cred->user->processes);
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
@@ -363,6 +372,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
+ ret = set_cred_ucounts(new);
+ if (ret < 0)
+ goto error_put;
}
#ifdef CONFIG_KEYS
@@ -384,8 +396,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif
- atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(new, 2);
validate_creds(new);
return 0;
@@ -485,12 +497,12 @@ int commit_creds(struct cred *new)
* in set_user().
*/
alter_cred_subscribers(new, 2);
- if (new->user != old->user)
- atomic_inc(&new->user->processes);
+ if (new->user != old->user || new->user_ns != old->user_ns)
+ inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user)
- atomic_dec(&old->user->processes);
+ dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(old, -2);
/* send notifications */
@@ -653,6 +665,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b)
}
EXPORT_SYMBOL(cred_fscmp);
+int set_cred_ucounts(struct cred *new)
+{
+ struct task_struct *task = current;
+ const struct cred *old = task->real_cred;
+ struct ucounts *old_ucounts = new->ucounts;
+
+ if (new->user == old->user && new->user_ns == old->user_ns)
+ return 0;
+
+ /*
+ * This optimization is needed because alloc_ucounts() uses locks
+ * for table lookups.
+ */
+ if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+ return 0;
+
+ if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid)))
+ return -EAGAIN;
+
+ if (old_ucounts)
+ put_ucounts(old_ucounts);
+
+ return 0;
+}
+
/*
* initialise the credentials stuff
*/
@@ -719,6 +756,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
+ new->ucounts = get_ucounts(new->ucounts);
+ if (!new->ucounts)
+ goto error;
+
put_cred(old);
validate_creds(new);
return new;
diff --git a/kernel/exit.c b/kernel/exit.c
index 65809fac3038..9a89e7f36acb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -188,7 +188,7 @@ repeat:
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
- atomic_dec(&__task_cred(p)->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
cgroup_release(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4820c2aaeb5a..b4386ff6a641 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -825,9 +825,14 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
- for (i = 0; i < UCOUNT_COUNTS; i++)
+ for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
+
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache);
@@ -1978,8 +1983,7 @@ static __latent_entropy struct task_struct *copy_process(
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
- if (atomic_read(&p->real_cred->user->processes) >=
- task_rlimit(p, RLIMIT_NPROC)) {
+ if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
@@ -2388,7 +2392,7 @@ bad_fork_cleanup_threadgroup_lock:
#endif
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- atomic_dec(&p->cred->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
@@ -3001,6 +3005,12 @@ int ksys_unshare(unsigned long unshare_flags)
if (err)
goto bad_unshare_cleanup_cred;
+ if (new_cred) {
+ err = set_cred_ucounts(new_cred);
+ if (err)
+ goto bad_unshare_cleanup_cred;
+ }
+
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 20d1d896d5b0..de0920353d30 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -412,8 +412,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
- struct user_struct *user;
- int sigpending;
+ struct ucounts *ucounts = NULL;
+ long sigpending;
/*
* Protect access to @t credentials. This can go away when all
@@ -424,27 +424,26 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
* changes from/to zero.
*/
rcu_read_lock();
- user = __task_cred(t)->user;
- sigpending = atomic_inc_return(&user->sigpending);
+ ucounts = task_ucounts(t);
+ sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
if (sigpending == 1)
- get_uid(user);
+ ucounts = get_ucounts(ucounts);
rcu_read_unlock();
- if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
+ if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
} else {
print_dropped_signal(sig);
}
if (unlikely(q == NULL)) {
- if (atomic_dec_and_test(&user->sigpending))
- free_uid(user);
+ if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1))
+ put_ucounts(ucounts);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = sigqueue_flags;
- q->user = user;
+ q->ucounts = ucounts;
}
-
return q;
}
@@ -452,8 +451,10 @@ static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- if (atomic_dec_and_test(&q->user->sigpending))
- free_uid(q->user);
+ if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) {
+ put_ucounts(q->ucounts);
+ q->ucounts = NULL;
+ }
kmem_cache_free(sigqueue_cachep, q);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 9de46a4bf492..ef1a78f5d71c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -479,7 +479,7 @@ static int set_user(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage.
*/
- if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
+ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
new_user != INIT_USER)
current->flags |= PF_NPROC_EXCEEDED;
else
@@ -558,6 +558,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -616,6 +620,10 @@ long __sys_setuid(uid_t uid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -691,6 +699,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if (retval < 0)
goto error;
+ retval = set_cred_ucounts(new);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8d8874f1c35e..87799e2379bd 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -8,6 +8,12 @@
#include <linux/kmemleak.h>
#include <linux/user_namespace.h>
+struct ucounts init_ucounts = {
+ .ns = &init_user_ns,
+ .uid = GLOBAL_ROOT_UID,
+ .count = ATOMIC_INIT(1),
+};
+
#define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
static DEFINE_SPINLOCK(ucounts_lock);
@@ -78,6 +84,10 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_fanotify_groups"),
UCOUNT_ENTRY("max_fanotify_marks"),
#endif
+ { },
+ { },
+ { },
+ { },
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -129,7 +139,24 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc
return NULL;
}
-static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+static void hlist_add_ucounts(struct ucounts *ucounts)
+{
+ struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
+ spin_lock_irq(&ucounts_lock);
+ hlist_add_head(&ucounts->node, hashent);
+ spin_unlock_irq(&ucounts_lock);
+}
+
+struct ucounts *get_ucounts(struct ucounts *ucounts)
+{
+ if (ucounts && atomic_add_negative(1, &ucounts->count)) {
+ put_ucounts(ucounts);
+ ucounts = NULL;
+ }
+ return ucounts;
+}
+
+struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
@@ -145,7 +172,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
new->ns = ns;
new->uid = uid;
- new->count = 0;
+ atomic_set(&new->count, 1);
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -153,40 +180,35 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
kfree(new);
} else {
hlist_add_head(&new->node, hashent);
- ucounts = new;
+ spin_unlock_irq(&ucounts_lock);
+ return new;
}
}
- if (ucounts->count == INT_MAX)
- ucounts = NULL;
- else
- ucounts->count += 1;
spin_unlock_irq(&ucounts_lock);
+ ucounts = get_ucounts(ucounts);
return ucounts;
}
-static void put_ucounts(struct ucounts *ucounts)
+void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
- spin_lock_irqsave(&ucounts_lock, flags);
- ucounts->count -= 1;
- if (!ucounts->count)
+ if (atomic_dec_and_test(&ucounts->count)) {
+ spin_lock_irqsave(&ucounts_lock, flags);
hlist_del_init(&ucounts->node);
- else
- ucounts = NULL;
- spin_unlock_irqrestore(&ucounts_lock, flags);
-
- kfree(ucounts);
+ spin_unlock_irqrestore(&ucounts_lock, flags);
+ kfree(ucounts);
+ }
}
-static inline bool atomic_inc_below(atomic_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
{
- int c, old;
- c = atomic_read(v);
+ long c, old;
+ c = atomic_long_read(v);
for (;;) {
if (unlikely(c >= u))
return false;
- old = atomic_cmpxchg(v, c, c+1);
+ old = atomic_long_cmpxchg(v, c, c+1);
if (likely(old == c))
return true;
c = old;
@@ -198,19 +220,19 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
{
struct ucounts *ucounts, *iter, *bad;
struct user_namespace *tns;
- ucounts = get_ucounts(ns, uid);
+ ucounts = alloc_ucounts(ns, uid);
for (iter = ucounts; iter; iter = tns->ucounts) {
- int max;
+ long max;
tns = iter->ns;
max = READ_ONCE(tns->ucount_max[type]);
- if (!atomic_inc_below(&iter->ucount[type], max))
+ if (!atomic_long_inc_below(&iter->ucount[type], max))
goto fail;
}
return ucounts;
fail:
bad = iter;
for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
- atomic_dec(&iter->ucount[type]);
+ atomic_long_dec(&iter->ucount[type]);
put_ucounts(ucounts);
return NULL;
@@ -220,12 +242,54 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{
struct ucounts *iter;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- int dec = atomic_dec_if_positive(&iter->ucount[type]);
+ long dec = atomic_long_dec_if_positive(&iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
}
put_ucounts(ucounts);
}
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+ struct ucounts *iter;
+ long ret = 0;
+
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long max = READ_ONCE(iter->ns->ucount_max[type]);
+ long new = atomic_long_add_return(v, &iter->ucount[type]);
+ if (new < 0 || new > max)
+ ret = LONG_MAX;
+ else if (iter == ucounts)
+ ret = new;
+ }
+ return ret;
+}
+
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+ struct ucounts *iter;
+ long new = -1; /* Silence compiler warning */
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long dec = atomic_long_add_return(-v, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+ if (iter == ucounts)
+ new = dec;
+ }
+ return (new == 0);
+}
+
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
+{
+ struct ucounts *iter;
+ if (get_ucounts_value(ucounts, type) > max)
+ return true;
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ max = READ_ONCE(iter->ns->ucount_max[type]);
+ if (get_ucounts_value(iter, type) > max)
+ return true;
+ }
+ return false;
+}
+
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
@@ -241,6 +305,8 @@ static __init int user_namespace_sysctl_init(void)
BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
+ hlist_add_ucounts(&init_ucounts);
+ inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
subsys_initcall(user_namespace_sysctl_init);
diff --git a/kernel/user.c b/kernel/user.c
index a2478cddf536..c82399c1618a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,9 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
.__count = REFCOUNT_INIT(1),
- .processes = ATOMIC_INIT(1),
- .sigpending = ATOMIC_INIT(0),
- .locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
.ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
};
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8d62863721b0..ef82d401dde8 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -119,9 +119,13 @@ int create_user_ns(struct cred *new)
ns->owner = owner;
ns->group = group;
INIT_WORK(&ns->work, free_user_ns);
- for (i = 0; i < UCOUNT_COUNTS; i++) {
+ for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
@@ -1340,6 +1344,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns));
+ if (set_cred_ucounts(cred) < 0)
+ return -EINVAL;
+
return 0;
}