From 115085ea0794c0f339be8f9d25505c7f9861d824 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 6 Dec 2006 20:36:51 -0800 Subject: [PATCH] taskstats: cleanup do_exit() path do_exit: taskstats_exit_alloc() ... taskstats_exit_send() taskstats_exit_free() I think this is not good, let it be a single function exported to the core kernel, taskstats_exit(), which does alloc + send + free itself. Signed-off-by: Oleg Nesterov Cc: Balbir Singh Cc: Shailabh Nagar Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 06de6c4e8ca3..4e3f919edc48 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -850,9 +850,7 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; - struct taskstats *tidstats; int group_dead; - unsigned int mycpu; profile_task_exit(tsk); @@ -890,8 +888,6 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats, &mycpu); - acct_update_integrals(tsk); if (tsk->mm) { update_hiwater_rss(tsk->mm); @@ -911,8 +907,8 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, group_dead, mycpu); - taskstats_exit_free(tidstats); + + taskstats_exit(tsk, group_dead); exit_mm(tsk); -- cgit v1.2.3 From 24ec839c431eb79bb8f6abc00c4e1eb3b8c4d517 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Dec 2006 02:36:04 -0800 Subject: [PATCH] tty: ->signal->tty locking Fix the locking of signal->tty. Use ->sighand->siglock to protect ->signal->tty; this lock is already used by most other members of ->signal/->sighand. And unless we are 'current' or the tasklist_lock is held we need ->siglock to access ->signal anyway. (NOTE: sys_unshare() is broken wrt ->sighand locking rules) Note that tty_mutex is held over tty destruction, so while holding tty_mutex any tty pointer remains valid. Otherwise the lifetime of ttys are governed by their open file handles. This leaves some holes for tty access from signal->tty (or any other non file related tty access). It solves the tty SLAB scribbles we were seeing. (NOTE: the change from group_send_sig_info to __group_send_sig_info needs to be examined by someone familiar with the security framework, I think it is safe given the SEND_SIG_PRIV from other __group_send_sig_info invocations) [schwidefsky@de.ibm.com: 3270 fix] [akpm@osdl.org: various post-viro fixes] Signed-off-by: Peter Zijlstra Acked-by: Alan Cox Cc: Oleg Nesterov Cc: Prarit Bhargava Cc: Chris Wright Cc: Roland McGrath Cc: Stephen Smalley Cc: James Morris Cc: "David S. Miller" Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Jan Kara Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc64/solaris/misc.c | 4 +- arch/um/kernel/exec.c | 7 +- drivers/char/tty_io.c | 237 ++++++++++++++++++++++++++------------------ drivers/s390/char/fs3270.c | 12 ++- fs/dquot.c | 14 +-- fs/open.c | 1 + include/linux/tty.h | 11 +- kernel/acct.c | 9 +- kernel/auditsc.c | 2 + kernel/exit.c | 4 +- kernel/sys.c | 6 +- security/selinux/hooks.c | 11 +- 12 files changed, 186 insertions(+), 132 deletions(-) (limited to 'kernel/exit.c') diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c index 9ed997982f8d..e84241d5f7f4 100644 --- a/arch/sparc64/solaris/misc.c +++ b/arch/sparc64/solaris/misc.c @@ -423,9 +423,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid) Solaris setpgrp and setsid? */ ret = sys_setpgid(0, 0); if (ret) return ret; - mutex_lock(&tty_mutex); - current->signal->tty = NULL; - mutex_unlock(&tty_mutex); + proc_clear_tty(current); return process_group(current); } case 2: /* getsid */ diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 0561c43b4685..8d56ec6cca79 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -39,12 +39,13 @@ static long execve1(char *file, char __user * __user *argv, char __user *__user *env) { long error; + struct tty_struct *tty; #ifdef CONFIG_TTY_LOG mutex_lock(&tty_mutex); - task_lock(current); /* FIXME: is this needed ? */ - log_exec(argv, current->signal->tty); - task_unlock(current); + tty = get_current_tty(); + if (tty) + log_exec(argv, tty); mutex_unlock(&tty_mutex); #endif error = do_execve(file, argv, env, ¤t->thread.regs); diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index b3cfc8bc613c..0c856a6f3677 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -126,7 +126,7 @@ EXPORT_SYMBOL(tty_std_termios); LIST_HEAD(tty_drivers); /* linked list of tty drivers */ -/* Semaphore to protect creating and releasing a tty. This is shared with +/* Mutex to protect creating and releasing a tty. This is shared with vt.c for deeply disgusting hack reasons */ DEFINE_MUTEX(tty_mutex); EXPORT_SYMBOL(tty_mutex); @@ -250,7 +250,7 @@ static int check_tty_count(struct tty_struct *tty, const char *routine) "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); return count; - } + } #endif return 0; } @@ -259,18 +259,6 @@ static int check_tty_count(struct tty_struct *tty, const char *routine) * Tty buffer allocation management */ - -/** - * tty_buffer_free_all - free buffers used by a tty - * @tty: tty to free from - * - * Remove all the buffers pending on a tty whether queued with data - * or in the free ring. Must be called when the tty is no longer in use - * - * Locking: none - */ - - /** * tty_buffer_free_all - free buffers used by a tty * @tty: tty to free from @@ -614,7 +602,7 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string_flags); * they are not on hot paths so a little discipline won't do * any harm. * - * Locking: takes termios_sem + * Locking: takes termios_mutex */ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) @@ -915,7 +903,7 @@ static void tty_ldisc_enable(struct tty_struct *tty) * context. * * Locking: takes tty_ldisc_lock. - * called functions take termios_sem + * called functions take termios_mutex */ static int tty_set_ldisc(struct tty_struct *tty, int ldisc) @@ -1267,12 +1255,12 @@ EXPORT_SYMBOL_GPL(tty_ldisc_flush); * * Locking: * BKL - * redirect lock for undoing redirection - * file list lock for manipulating list of ttys - * tty_ldisc_lock from called functions - * termios_sem resetting termios data - * tasklist_lock to walk task list for hangup event - * + * redirect lock for undoing redirection + * file list lock for manipulating list of ttys + * tty_ldisc_lock from called functions + * termios_mutex resetting termios data + * tasklist_lock to walk task list for hangup event + * ->siglock to protect ->signal/->sighand */ static void do_tty_hangup(struct work_struct *work) { @@ -1354,14 +1342,18 @@ static void do_tty_hangup(struct work_struct *work) read_lock(&tasklist_lock); if (tty->session > 0) { do_each_task_pid(tty->session, PIDTYPE_SID, p) { + spin_lock_irq(&p->sighand->siglock); if (p->signal->tty == tty) p->signal->tty = NULL; - if (!p->signal->leader) + if (!p->signal->leader) { + spin_unlock_irq(&p->sighand->siglock); continue; - group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); - group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); + } + __group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); + __group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); if (tty->pgrp > 0) p->signal->tty_old_pgrp = tty->pgrp; + spin_unlock_irq(&p->sighand->siglock); } while_each_task_pid(tty->session, PIDTYPE_SID, p); } read_unlock(&tasklist_lock); @@ -1453,6 +1445,14 @@ int tty_hung_up_p(struct file * filp) EXPORT_SYMBOL(tty_hung_up_p); +static void session_clear_tty(pid_t session) +{ + struct task_struct *p; + do_each_task_pid(session, PIDTYPE_SID, p) { + proc_clear_tty(p); + } while_each_task_pid(session, PIDTYPE_SID, p); +} + /** * disassociate_ctty - disconnect controlling tty * @on_exit: true if exiting so need to "hang up" the session @@ -1469,31 +1469,35 @@ EXPORT_SYMBOL(tty_hung_up_p); * The argument on_exit is set to 1 if called when a process is * exiting; it is 0 if called by the ioctl TIOCNOTTY. * - * Locking: tty_mutex is taken to protect current->signal->tty + * Locking: * BKL is taken for hysterical raisins - * Tasklist lock is taken (under tty_mutex) to walk process - * lists for the session. + * tty_mutex is taken to protect tty + * ->siglock is taken to protect ->signal/->sighand + * tasklist_lock is taken to walk process list for sessions + * ->siglock is taken to protect ->signal/->sighand */ void disassociate_ctty(int on_exit) { struct tty_struct *tty; - struct task_struct *p; int tty_pgrp = -1; + int session; lock_kernel(); mutex_lock(&tty_mutex); - tty = current->signal->tty; + tty = get_current_tty(); if (tty) { tty_pgrp = tty->pgrp; mutex_unlock(&tty_mutex); + /* XXX: here we race, there is nothing protecting tty */ if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) tty_vhangup(tty); } else { - if (current->signal->tty_old_pgrp) { - kill_pg(current->signal->tty_old_pgrp, SIGHUP, on_exit); - kill_pg(current->signal->tty_old_pgrp, SIGCONT, on_exit); + pid_t old_pgrp = current->signal->tty_old_pgrp; + if (old_pgrp) { + kill_pg(old_pgrp, SIGHUP, on_exit); + kill_pg(old_pgrp, SIGCONT, on_exit); } mutex_unlock(&tty_mutex); unlock_kernel(); @@ -1505,19 +1509,29 @@ void disassociate_ctty(int on_exit) kill_pg(tty_pgrp, SIGCONT, on_exit); } - /* Must lock changes to tty_old_pgrp */ - mutex_lock(&tty_mutex); + spin_lock_irq(¤t->sighand->siglock); current->signal->tty_old_pgrp = 0; - tty->session = 0; - tty->pgrp = -1; + session = current->signal->session; + spin_unlock_irq(¤t->sighand->siglock); + + mutex_lock(&tty_mutex); + /* It is possible that do_tty_hangup has free'd this tty */ + tty = get_current_tty(); + if (tty) { + tty->session = 0; + tty->pgrp = 0; + } else { +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "error attempted to write to tty [0x%p]" + " = NULL", tty); +#endif + } + mutex_unlock(&tty_mutex); /* Now clear signal->tty under the lock */ read_lock(&tasklist_lock); - do_each_task_pid(current->signal->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; - } while_each_task_pid(current->signal->session, PIDTYPE_SID, p); + session_clear_tty(session); read_unlock(&tasklist_lock); - mutex_unlock(&tty_mutex); unlock_kernel(); } @@ -2337,16 +2351,10 @@ static void release_dev(struct file * filp) * tty. */ if (tty_closing || o_tty_closing) { - struct task_struct *p; - read_lock(&tasklist_lock); - do_each_task_pid(tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; - } while_each_task_pid(tty->session, PIDTYPE_SID, p); + session_clear_tty(tty->session); if (o_tty) - do_each_task_pid(o_tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; - } while_each_task_pid(o_tty->session, PIDTYPE_SID, p); + session_clear_tty(o_tty->session); read_unlock(&tasklist_lock); } @@ -2443,9 +2451,9 @@ static void release_dev(struct file * filp) * The termios state of a pty is reset on first open so that * settings don't persist across reuse. * - * Locking: tty_mutex protects current->signal->tty, get_tty_driver and - * init_dev work. tty->count should protect the rest. - * task_lock is held to update task details for sessions + * Locking: tty_mutex protects tty, get_tty_driver and init_dev work. + * tty->count should protect the rest. + * ->siglock protects ->signal/->sighand */ static int tty_open(struct inode * inode, struct file * filp) @@ -2467,12 +2475,13 @@ retry_open: mutex_lock(&tty_mutex); if (device == MKDEV(TTYAUX_MAJOR,0)) { - if (!current->signal->tty) { + tty = get_current_tty(); + if (!tty) { mutex_unlock(&tty_mutex); return -ENXIO; } - driver = current->signal->tty->driver; - index = current->signal->tty->index; + driver = tty->driver; + index = tty->index; filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ goto got_driver; @@ -2547,17 +2556,16 @@ got_driver: filp->f_op = &tty_fops; goto retry_open; } + + mutex_lock(&tty_mutex); + spin_lock_irq(¤t->sighand->siglock); if (!noctty && current->signal->leader && !current->signal->tty && - tty->session == 0) { - task_lock(current); - current->signal->tty = tty; - task_unlock(current); - current->signal->tty_old_pgrp = 0; - tty->session = current->signal->session; - tty->pgrp = process_group(current); - } + tty->session == 0) + __proc_set_tty(current, tty); + spin_unlock_irq(¤t->sighand->siglock); + mutex_unlock(&tty_mutex); return 0; } @@ -2747,7 +2755,7 @@ static int tiocsti(struct tty_struct *tty, char __user *p) * * Copies the kernel idea of the window size into the user buffer. * - * Locking: tty->termios_sem is taken to ensure the winsize data + * Locking: tty->termios_mutex is taken to ensure the winsize data * is consistent. */ @@ -2774,8 +2782,8 @@ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user * arg) * Locking: * Called function use the console_sem is used to ensure we do * not try and resize the console twice at once. - * The tty->termios_sem is used to ensure we don't double - * resize and get confused. Lock order - tty->termios.sem before + * The tty->termios_mutex is used to ensure we don't double + * resize and get confused. Lock order - tty->termios_mutex before * console sem */ @@ -2880,25 +2888,28 @@ static int fionbio(struct file *file, int __user *p) * leader to set this tty as the controlling tty for the session. * * Locking: - * Takes tasklist lock internally to walk sessions - * Takes task_lock() when updating signal->tty * Takes tty_mutex() to protect tty instance - * + * Takes tasklist_lock internally to walk sessions + * Takes ->siglock() when updating signal->tty */ static int tiocsctty(struct tty_struct *tty, int arg) { - struct task_struct *p; - + int ret = 0; if (current->signal->leader && (current->signal->session == tty->session)) - return 0; + return ret; + + mutex_lock(&tty_mutex); /* * The process must be a session leader and * not have a controlling tty already. */ - if (!current->signal->leader || current->signal->tty) - return -EPERM; + if (!current->signal->leader || current->signal->tty) { + ret = -EPERM; + goto unlock; + } + if (tty->session > 0) { /* * This tty is already the controlling @@ -2908,24 +2919,18 @@ static int tiocsctty(struct tty_struct *tty, int arg) /* * Steal it away */ - read_lock(&tasklist_lock); - do_each_task_pid(tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; - } while_each_task_pid(tty->session, PIDTYPE_SID, p); + session_clear_tty(tty->session); read_unlock(&tasklist_lock); - } else - return -EPERM; + } else { + ret = -EPERM; + goto unlock; + } } - mutex_lock(&tty_mutex); - task_lock(current); - current->signal->tty = tty; - task_unlock(current); + proc_set_tty(current, tty); +unlock: mutex_unlock(&tty_mutex); - current->signal->tty_old_pgrp = 0; - tty->session = current->signal->session; - tty->pgrp = process_group(current); - return 0; + return ret; } /** @@ -2937,7 +2942,7 @@ static int tiocsctty(struct tty_struct *tty, int arg) * Obtain the process group of the tty. If there is no process group * return an error. * - * Locking: none. Reference to ->signal->tty is safe. + * Locking: none. Reference to current->signal->tty is safe. */ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) @@ -2995,7 +3000,7 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t * Obtain the session id of the tty. If there is no session * return an error. * - * Locking: none. Reference to ->signal->tty is safe. + * Locking: none. Reference to current->signal->tty is safe. */ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) @@ -3214,14 +3219,11 @@ int tty_ioctl(struct inode * inode, struct file * file, clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNOTTY: - /* FIXME: taks lock or tty_mutex ? */ if (current->signal->tty != tty) return -ENOTTY; if (current->signal->leader) disassociate_ctty(0); - task_lock(current); - current->signal->tty = NULL; - task_unlock(current); + proc_clear_tty(current); return 0; case TIOCSCTTY: return tiocsctty(tty, arg); @@ -3321,7 +3323,7 @@ static void __do_SAK(struct work_struct *work) if (!tty) return; - session = tty->session; + session = tty->session; /* We don't want an ldisc switch during this */ disc = tty_ldisc_ref(tty); @@ -3834,9 +3836,52 @@ int tty_unregister_driver(struct tty_driver *driver) cdev_del(&driver->cdev); return 0; } - EXPORT_SYMBOL(tty_unregister_driver); +dev_t tty_devnum(struct tty_struct *tty) +{ + return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; +} +EXPORT_SYMBOL(tty_devnum); + +void proc_clear_tty(struct task_struct *p) +{ + spin_lock_irq(&p->sighand->siglock); + p->signal->tty = NULL; + spin_unlock_irq(&p->sighand->siglock); +} +EXPORT_SYMBOL(proc_clear_tty); + +void __proc_set_tty(struct task_struct *tsk, struct tty_struct *tty) +{ + if (tty) { + tty->session = tsk->signal->session; + tty->pgrp = process_group(tsk); + } + tsk->signal->tty = tty; + tsk->signal->tty_old_pgrp = 0; +} + +void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty) +{ + spin_lock_irq(&tsk->sighand->siglock); + __proc_set_tty(tsk, tty); + spin_unlock_irq(&tsk->sighand->siglock); +} + +struct tty_struct *get_current_tty(void) +{ + struct tty_struct *tty; + WARN_ON_ONCE(!mutex_is_locked(&tty_mutex)); + tty = current->signal->tty; + /* + * session->tty can be changed/cleared from under us, make sure we + * issue the load. The obtained pointer, when not NULL, is valid as + * long as we hold tty_mutex. + */ + barrier(); + return tty; +} /* * Initialize the console device. This is called *early*, so diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c index 78f8bda81dae..ef205ddd31a4 100644 --- a/drivers/s390/char/fs3270.c +++ b/drivers/s390/char/fs3270.c @@ -424,11 +424,15 @@ fs3270_open(struct inode *inode, struct file *filp) minor = iminor(filp->f_dentry->d_inode); /* Check for minor 0 multiplexer. */ if (minor == 0) { - if (!current->signal->tty) + struct tty_struct *tty; + mutex_lock(&tty_mutex); + tty = get_current_tty(); + if (!tty || tty->driver->major != IBM_TTY3270_MAJOR) { + mutex_unlock(&tty_mutex); return -ENODEV; - if (current->signal->tty->driver->major != IBM_TTY3270_MAJOR) - return -ENODEV; - minor = current->signal->tty->index + RAW3270_FIRSTMINOR; + } + minor = tty->index + RAW3270_FIRSTMINOR; + mutex_unlock(&tty_mutex); } /* Check if some other program is already using fullscreen mode. */ fp = (struct fs3270 *) raw3270_find_view(&fs3270_fn, minor); diff --git a/fs/dquot.c b/fs/dquot.c index f9cd5e23ebdf..89066b19124d 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -828,6 +828,7 @@ static inline int need_print_warning(struct dquot *dquot) static void print_warning(struct dquot *dquot, const char warntype) { char *msg = NULL; + struct tty_struct *tty; int flag = (warntype == BHARDWARN || warntype == BSOFTLONGWARN) ? DQ_BLKS_B : ((warntype == IHARDWARN || warntype == ISOFTLONGWARN) ? DQ_INODES_B : 0); @@ -835,14 +836,15 @@ static void print_warning(struct dquot *dquot, const char warntype) return; mutex_lock(&tty_mutex); - if (!current->signal->tty) + tty = get_current_tty(); + if (!tty) goto out_lock; - tty_write_message(current->signal->tty, dquot->dq_sb->s_id); + tty_write_message(tty, dquot->dq_sb->s_id); if (warntype == ISOFTWARN || warntype == BSOFTWARN) - tty_write_message(current->signal->tty, ": warning, "); + tty_write_message(tty, ": warning, "); else - tty_write_message(current->signal->tty, ": write failed, "); - tty_write_message(current->signal->tty, quotatypes[dquot->dq_type]); + tty_write_message(tty, ": write failed, "); + tty_write_message(tty, quotatypes[dquot->dq_type]); switch (warntype) { case IHARDWARN: msg = " file limit reached.\r\n"; @@ -863,7 +865,7 @@ static void print_warning(struct dquot *dquot, const char warntype) msg = " block quota exceeded.\r\n"; break; } - tty_write_message(current->signal->tty, msg); + tty_write_message(tty, msg); out_lock: mutex_unlock(&tty_mutex); } diff --git a/fs/open.c b/fs/open.c index 89e0c237a636..3b56192816ca 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1087,6 +1087,7 @@ EXPORT_SYMBOL(sys_close); asmlinkage long sys_vhangup(void) { if (capable(CAP_SYS_TTY_CONFIG)) { + /* XXX: this needs locking */ tty_vhangup(current->signal->tty); return 0; } diff --git a/include/linux/tty.h b/include/linux/tty.h index f717f0898238..1d29999a3439 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -309,6 +309,12 @@ extern void tty_ldisc_flush(struct tty_struct *tty); extern int tty_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); +extern dev_t tty_devnum(struct tty_struct *tty); +extern void proc_clear_tty(struct task_struct *p); +extern void __proc_set_tty(struct task_struct *tsk, struct tty_struct *tty); +extern void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty); +extern struct tty_struct *get_current_tty(void); + extern struct mutex tty_mutex; /* n_tty.c */ @@ -335,10 +341,5 @@ extern void console_print(const char *); extern int vt_ioctl(struct tty_struct *tty, struct file * file, unsigned int cmd, unsigned long arg); -static inline dev_t tty_devnum(struct tty_struct *tty) -{ - return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; -} - #endif /* __KERNEL__ */ #endif diff --git a/kernel/acct.c b/kernel/acct.c index dc12db8600e7..ca5619039367 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -428,6 +428,7 @@ static void do_acct_process(struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + struct tty_struct *tty; /* * First check to see if there is enough free_space to continue @@ -485,12 +486,8 @@ static void do_acct_process(struct file *file) #endif mutex_lock(&tty_mutex); - /* FIXME: Whoever is responsible for current->signal locking needs - to use the same locking all over the kernel and document it */ - read_lock(&tasklist_lock); - ac.ac_tty = current->signal->tty ? - old_encode_dev(tty_devnum(current->signal->tty)) : 0; - read_unlock(&tasklist_lock); + tty = get_current_tty(); + ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; mutex_unlock(&tty_mutex); spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 40722e26de98..b6cb802fbcd1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->return_code); mutex_lock(&tty_mutex); + read_lock(&tasklist_lock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; + read_unlock(&tasklist_lock); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" diff --git a/kernel/exit.c b/kernel/exit.c index 4e3f919edc48..fa235779b6a3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -384,9 +384,7 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - mutex_lock(&tty_mutex); - current->signal->tty = NULL; - mutex_unlock(&tty_mutex); + proc_clear_tty(current); /* Block and flush all signals */ sigfillset(&blocked); diff --git a/kernel/sys.c b/kernel/sys.c index a0c1a29a507f..1ac2d1c5d84e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1484,7 +1484,6 @@ asmlinkage long sys_setsid(void) pid_t session; int err = -EPERM; - mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); /* Fail if I am already a session leader */ @@ -1504,12 +1503,15 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(session, session); + + spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; group_leader->signal->tty_old_pgrp = 0; + spin_unlock(&group_leader->sighand->siglock); + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); - mutex_unlock(&tty_mutex); return err; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 44e9cd470543..f5df8c70a9b5 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1695,9 +1695,10 @@ static inline void flush_unauthorized_files(struct files_struct * files) struct tty_struct *tty; struct fdtable *fdt; long j = -1; + int drop_tty = 0; mutex_lock(&tty_mutex); - tty = current->signal->tty; + tty = get_current_tty(); if (tty) { file_list_lock(); file = list_entry(tty->tty_files.next, typeof(*file), f_u.fu_list); @@ -1710,12 +1711,14 @@ static inline void flush_unauthorized_files(struct files_struct * files) struct inode *inode = file->f_dentry->d_inode; if (inode_has_perm(current, inode, FILE__READ | FILE__WRITE, NULL)) { - /* Reset controlling tty. */ - current->signal->tty = NULL; - current->signal->tty_old_pgrp = 0; + drop_tty = 1; } } file_list_unlock(); + + /* Reset controlling tty. */ + if (drop_tty) + proc_set_tty(current, NULL); } mutex_unlock(&tty_mutex); -- cgit v1.2.3 From ae424ae4b5bcd820ad6ee6f0b986c4e14ed4d6cf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Dec 2006 02:36:08 -0800 Subject: [PATCH] make set_special_pids() static Make set_special_pids() static, the only caller is daemonize(). Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/exit.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index dede82c63445..5e8a0ba61749 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1240,7 +1240,6 @@ extern struct mm_struct init_mm; #define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) extern struct task_struct *find_task_by_pid_type(int type, int pid); -extern void set_special_pids(pid_t session, pid_t pgrp); extern void __set_special_pids(pid_t session, pid_t pgrp); /* per-UID process charging. */ diff --git a/kernel/exit.c b/kernel/exit.c index fa235779b6a3..fa0495e167e9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -314,7 +314,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) } } -void set_special_pids(pid_t session, pid_t pgrp) +static void set_special_pids(pid_t session, pid_t pgrp) { write_lock_irq(&tasklist_lock); __set_special_pids(session, pgrp); -- cgit v1.2.3 From 937949d9edbf4049bd41af6c9f92c26280584564 Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 8 Dec 2006 02:37:54 -0800 Subject: [PATCH] add process_session() helper routine Replace occurences of task->signal->session by a new process_session() helper routine. It will be useful for pid namespaces to abstract the session pid number. Signed-off-by: Cedric Le Goater Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Herbert Poetzl Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/irixelf.c | 2 +- drivers/char/mxser.c | 2 +- drivers/char/rocket.c | 2 +- drivers/char/tty_io.c | 12 ++++++------ fs/binfmt_elf.c | 4 ++-- fs/binfmt_elf_fdpic.c | 4 ++-- include/linux/sched.h | 5 +++++ kernel/exit.c | 19 ++++++++++--------- kernel/fork.c | 4 ++-- kernel/signal.c | 2 +- kernel/sys.c | 8 ++++---- 11 files changed, 35 insertions(+), 29 deletions(-) (limited to 'kernel/exit.c') diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index 1bbefbf43373..37cad5de515c 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c @@ -1145,7 +1145,7 @@ static int irix_core_dump(long signr, struct pt_regs * regs, struct file *file) psinfo.pr_pid = prstatus.pr_pid = current->pid; psinfo.pr_ppid = prstatus.pr_ppid = current->parent->pid; psinfo.pr_pgrp = prstatus.pr_pgrp = process_group(current); - psinfo.pr_sid = prstatus.pr_sid = current->signal->session; + psinfo.pr_sid = prstatus.pr_sid = process_session(current); if (current->pid == current->tgid) { /* * This is the record for the group leader. Add in the diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c index 5ed2486b7581..b1cc89c89820 100644 --- a/drivers/char/mxser.c +++ b/drivers/char/mxser.c @@ -994,7 +994,7 @@ static int mxser_open(struct tty_struct *tty, struct file *filp) mxser_change_speed(info, NULL); } - info->session = current->signal->session; + info->session = process_session(current); info->pgrp = process_group(current); /* diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c index bac80056f7e0..4fdf52e9f3b1 100644 --- a/drivers/char/rocket.c +++ b/drivers/char/rocket.c @@ -1017,7 +1017,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp) /* * Info->count is now 1; so it's safe to sleep now. */ - info->session = current->signal->session; + info->session = process_session(current); info->pgrp = process_group(current); if ((info->flags & ROCKET_INITIALIZED) == 0) { diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 8a2d26e98ac4..4ebba7ca1dc9 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -1511,7 +1511,7 @@ void disassociate_ctty(int on_exit) spin_lock_irq(¤t->sighand->siglock); current->signal->tty_old_pgrp = 0; - session = current->signal->session; + session = process_session(current); spin_unlock_irq(¤t->sighand->siglock); mutex_lock(&tty_mutex); @@ -2897,7 +2897,7 @@ static int tiocsctty(struct tty_struct *tty, int arg) { int ret = 0; if (current->signal->leader && - (current->signal->session == tty->session)) + (process_session(current) == tty->session)) return ret; mutex_lock(&tty_mutex); @@ -2979,13 +2979,13 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t return retval; if (!current->signal->tty || (current->signal->tty != real_tty) || - (real_tty->session != current->signal->session)) + (real_tty->session != process_session(current))) return -ENOTTY; if (get_user(pgrp, p)) return -EFAULT; if (pgrp < 0) return -EINVAL; - if (session_of_pgrp(pgrp) != current->signal->session) + if (session_of_pgrp(pgrp) != process_session(current)) return -EPERM; real_tty->pgrp = pgrp; return 0; @@ -3338,7 +3338,7 @@ static void __do_SAK(struct work_struct *work) /* Kill the entire session */ do_each_task_pid(session, PIDTYPE_SID, p) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): p->signal->session==tty->session\n", + " (%s): process_session(p)==tty->session\n", p->pid, p->comm); send_sig(SIGKILL, p, 1); } while_each_task_pid(session, PIDTYPE_SID, p); @@ -3348,7 +3348,7 @@ static void __do_SAK(struct work_struct *work) do_each_thread(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): p->signal->session==tty->session\n", + " (%s): process_session(p)==tty->session\n", p->pid, p->comm); send_sig(SIGKILL, p, 1); continue; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c6dbb4a7ec78..d3adfd353ff9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1317,7 +1317,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pid = p->pid; prstatus->pr_ppid = p->parent->pid; prstatus->pr_pgrp = process_group(p); - prstatus->pr_sid = p->signal->session; + prstatus->pr_sid = process_session(p); if (thread_group_leader(p)) { /* * This is the record for the group leader. Add in the @@ -1363,7 +1363,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pid = p->pid; psinfo->pr_ppid = p->parent->pid; psinfo->pr_pgrp = process_group(p); - psinfo->pr_sid = p->signal->session; + psinfo->pr_sid = process_session(p); i = p->state ? ffz(~p->state) + 1 : 0; psinfo->pr_state = i; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9f0b7efc3df5..76f06f6bc2f6 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1322,7 +1322,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pid = p->pid; prstatus->pr_ppid = p->parent->pid; prstatus->pr_pgrp = process_group(p); - prstatus->pr_sid = p->signal->session; + prstatus->pr_sid = process_session(p); if (thread_group_leader(p)) { /* * This is the record for the group leader. Add in the @@ -1371,7 +1371,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pid = p->pid; psinfo->pr_ppid = p->parent->pid; psinfo->pr_pgrp = process_group(p); - psinfo->pr_sid = p->signal->session; + psinfo->pr_sid = process_session(p); i = p->state ? ffz(~p->state) + 1 : 0; psinfo->pr_state = i; diff --git a/include/linux/sched.h b/include/linux/sched.h index 5e8a0ba61749..270d864a8ff1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1047,6 +1047,11 @@ static inline pid_t process_group(struct task_struct *tsk) return tsk->signal->pgrp; } +static inline pid_t process_session(struct task_struct *tsk) +{ + return tsk->signal->session; +} + static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; diff --git a/kernel/exit.c b/kernel/exit.c index fa0495e167e9..8d289bfc13d1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -193,14 +193,14 @@ int session_of_pgrp(int pgrp) read_lock(&tasklist_lock); do_each_task_pid(pgrp, PIDTYPE_PGID, p) { - if (p->signal->session > 0) { - sid = p->signal->session; + if (process_session(p) > 0) { + sid = process_session(p); goto out; } } while_each_task_pid(pgrp, PIDTYPE_PGID, p); p = find_task_by_pid(pgrp); if (p) - sid = p->signal->session; + sid = process_session(p); out: read_unlock(&tasklist_lock); @@ -225,8 +225,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) || p->exit_state || is_init(p->real_parent)) continue; - if (process_group(p->real_parent) != pgrp - && p->real_parent->signal->session == p->signal->session) { + if (process_group(p->real_parent) != pgrp && + process_session(p->real_parent) == process_session(p)) { ret = 0; break; } @@ -302,7 +302,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current->group_leader; - if (curr->signal->session != session) { + if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); curr->signal->session = session; attach_pid(curr, PIDTYPE_SID, session); @@ -647,10 +647,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * outside, so the child pgrp is now orphaned. */ if ((process_group(p) != process_group(father)) && - (p->signal->session == father->signal->session)) { + (process_session(p) == process_session(father))) { int pgrp = process_group(p); - if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { + if (will_become_orphaned_pgrp(pgrp, NULL) && + has_stopped_jobs(pgrp)) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); } @@ -784,7 +785,7 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; if ((process_group(t) != process_group(tsk)) && - (t->signal->session == tsk->signal->session) && + (process_session(t) == process_session(tsk)) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); diff --git a/kernel/fork.c b/kernel/fork.c index 597707819327..298c4d6ab512 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1259,9 +1259,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); - p->signal->session = current->signal->session; + p->signal->session = process_session(current); attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->signal->session); + attach_pid(p, PIDTYPE_SID, process_session(p)); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; diff --git a/kernel/signal.c b/kernel/signal.c index ec81defde339..9eac4db60eda 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -583,7 +583,7 @@ static int check_kill_permission(int sig, struct siginfo *info, error = -EPERM; if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && ((sig != SIGCONT) || - (current->signal->session != t->signal->session)) + (process_session(current) != process_session(t))) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) diff --git a/kernel/sys.c b/kernel/sys.c index 1ac2d1c5d84e..4f9d23a3095f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != group_leader->signal->session) + if (process_session(p) != process_session(group_leader)) goto out; err = -EACCES; if (p->did_exec) @@ -1400,7 +1400,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *p; do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == group_leader->signal->session) + if (process_session(p) == process_session(group_leader)) goto ok_pgid; } while_each_task_pid(pgid, PIDTYPE_PGID, p); goto out; @@ -1459,7 +1459,7 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { if (!pid) - return current->signal->session; + return process_session(current); else { int retval; struct task_struct *p; @@ -1471,7 +1471,7 @@ asmlinkage long sys_getsid(pid_t pid) if (p) { retval = security_task_getsid(p); if (!retval) - retval = p->signal->session; + retval = process_session(p); } read_unlock(&tasklist_lock); return retval; -- cgit v1.2.3 From 1ec320afdc9552c92191d5f89fcd1ebe588334ca Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 8 Dec 2006 02:37:55 -0800 Subject: [PATCH] add process_session() helper routine: deprecate old field Add an anonymous union and ((deprecated)) to catch direct usage of the session field. [akpm@osdl.org: fix various missed conversions] [jdike@addtoit.com: fix UML bug] Signed-off-by: Jeff Dike Cc: Cedric Le Goater Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 2 +- fs/proc/array.c | 2 +- include/linux/init_task.h | 11 ++++++----- include/linux/sched.h | 19 +++++++++++++++++-- kernel/exit.c | 2 +- kernel/fork.c | 2 +- 6 files changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel/exit.c') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 4ebba7ca1dc9..48cee2004e97 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -3855,7 +3855,7 @@ EXPORT_SYMBOL(proc_clear_tty); void __proc_set_tty(struct task_struct *tsk, struct tty_struct *tty) { if (tty) { - tty->session = tsk->signal->session; + tty->session = process_session(tsk); tty->pgrp = process_group(tsk); } tsk->signal->tty = tty; diff --git a/fs/proc/array.c b/fs/proc/array.c index b0cd014a39bd..70e4fab117b1 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -381,7 +381,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) stime = cputime_add(stime, sig->stime); } - sid = sig->session; + sid = signal_session(sig); pgid = process_group(task); ppid = rcu_dereference(task->real_parent)->tgid; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 733790d4f7db..848a68af3d42 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -57,17 +57,18 @@ .cpu_vm_mask = CPU_MASK_ALL, \ } -#define INIT_SIGNALS(sig) { \ - .count = ATOMIC_INIT(1), \ +#define INIT_SIGNALS(sig) { \ + .count = ATOMIC_INIT(1), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ - .shared_pending = { \ + .shared_pending = { \ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ - .signal = {{0}}}, \ + .signal = {{0}}}, \ .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .pgrp = 1, \ - .session = 1, \ + .tty_old_pgrp = 0, \ + { .__session = 1}, \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 270d864a8ff1..6fec1d419714 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -436,7 +436,12 @@ struct signal_struct { /* job control IDs */ pid_t pgrp; pid_t tty_old_pgrp; - pid_t session; + + union { + pid_t session __deprecated; + pid_t __session; + }; + /* boolean value for session group leader */ int leader; @@ -1047,9 +1052,19 @@ static inline pid_t process_group(struct task_struct *tsk) return tsk->signal->pgrp; } +static inline pid_t signal_session(struct signal_struct *sig) +{ + return sig->__session; +} + static inline pid_t process_session(struct task_struct *tsk) { - return tsk->signal->session; + return signal_session(tsk->signal); +} + +static inline void set_signal_session(struct signal_struct *sig, pid_t session) +{ + sig->__session = session; } static inline struct pid *task_pid(struct task_struct *task) diff --git a/kernel/exit.c b/kernel/exit.c index 8d289bfc13d1..6267a6cc6113 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -304,7 +304,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) if (process_session(curr) != session) { detach_pid(curr, PIDTYPE_SID); - curr->signal->session = session; + set_signal_session(curr->signal, session); attach_pid(curr, PIDTYPE_SID, session); } if (process_group(curr) != pgrp) { diff --git a/kernel/fork.c b/kernel/fork.c index 298c4d6ab512..60d2644bfe85 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1259,7 +1259,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); - p->signal->session = process_session(current); + set_signal_session(p->signal, process_session(current)); attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, process_session(p)); -- cgit v1.2.3 From 6b3286ed1169d74fea401367d6d4d6c6ec758a81 Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Fri, 8 Dec 2006 02:37:56 -0800 Subject: [PATCH] rename struct namespace to struct mnt_namespace Rename 'struct namespace' to 'struct mnt_namespace' to avoid confusion with other namespaces being developped for the containers : pid, uts, ipc, etc. 'namespace' variables and attributes are also renamed to 'mnt_ns' Signed-off-by: Kirill Korotaev Signed-off-by: Cedric Le Goater Cc: Eric W. Biederman Cc: Herbert Poetzl Cc: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/mntpt.c | 2 +- fs/namespace.c | 113 +++++++++++++++++++++--------------------- fs/nfs/getroot.c | 2 +- fs/pnode.c | 2 +- fs/pnode.h | 2 +- fs/proc/base.c | 36 +++++++------- fs/reiserfs/super.c | 2 +- include/linux/init_task.h | 2 +- include/linux/mnt_namespace.h | 42 ++++++++++++++++ include/linux/mount.h | 4 +- include/linux/namespace.h | 42 ---------------- include/linux/nsproxy.h | 4 +- kernel/exit.c | 2 +- kernel/fork.c | 21 ++++---- kernel/kmod.c | 2 +- kernel/nsproxy.c | 16 +++--- 16 files changed, 148 insertions(+), 146 deletions(-) create mode 100644 include/linux/mnt_namespace.h delete mode 100644 include/linux/namespace.h (limited to 'kernel/exit.c') diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index f33b1a81a761..8f74e8450826 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "super.h" #include "cell.h" #include "volume.h" diff --git a/fs/namespace.c b/fs/namespace.c index b00ac84ebbdd..fde8553faa76 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -133,10 +133,10 @@ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) static inline int check_mnt(struct vfsmount *mnt) { - return mnt->mnt_namespace == current->nsproxy->namespace; + return mnt->mnt_ns == current->nsproxy->mnt_ns; } -static void touch_namespace(struct namespace *ns) +static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { ns->event = ++event; @@ -144,7 +144,7 @@ static void touch_namespace(struct namespace *ns) } } -static void __touch_namespace(struct namespace *ns) +static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { ns->event = event; @@ -187,19 +187,19 @@ static void commit_tree(struct vfsmount *mnt) struct vfsmount *parent = mnt->mnt_parent; struct vfsmount *m; LIST_HEAD(head); - struct namespace *n = parent->mnt_namespace; + struct mnt_namespace *n = parent->mnt_ns; BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); list_for_each_entry(m, &head, mnt_list) - m->mnt_namespace = n; + m->mnt_ns = n; list_splice(&head, n->list.prev); list_add_tail(&mnt->mnt_hash, mount_hashtable + hash(parent, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); - touch_namespace(n); + touch_mnt_namespace(n); } static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) @@ -320,7 +320,7 @@ EXPORT_SYMBOL(mnt_unpin); /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct namespace *n = m->private; + struct mnt_namespace *n = m->private; struct list_head *p; loff_t l = *pos; @@ -333,7 +333,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct namespace *n = m->private; + struct mnt_namespace *n = m->private; struct list_head *p = ((struct vfsmount *)v)->mnt_list.next; (*pos)++; return p == &n->list ? NULL : list_entry(p, struct vfsmount, mnt_list); @@ -526,8 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) list_for_each_entry(p, kill, mnt_hash) { list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); - __touch_namespace(p->mnt_namespace); - p->mnt_namespace = NULL; + __touch_mnt_namespace(p->mnt_ns); + p->mnt_ns = NULL; list_del_init(&p->mnt_child); if (p->mnt_parent != p) p->mnt_mountpoint->d_mounted--; @@ -830,7 +830,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_nd) { detach_mnt(source_mnt, parent_nd); attach_mnt(source_mnt, nd); - touch_namespace(current->nsproxy->namespace); + touch_mnt_namespace(current->nsproxy->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -1145,9 +1145,9 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts, */ if (!propagate_mount_busy(mnt, 2)) { /* delete from the namespace */ - touch_namespace(mnt->mnt_namespace); + touch_mnt_namespace(mnt->mnt_ns); list_del_init(&mnt->mnt_list); - mnt->mnt_namespace = NULL; + mnt->mnt_ns = NULL; umount_tree(mnt, 1, umounts); spin_unlock(&vfsmount_lock); } else { @@ -1168,7 +1168,7 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts, */ static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts) { - struct namespace *namespace; + struct mnt_namespace *ns; struct vfsmount *mnt; while (!list_empty(graveyard)) { @@ -1178,10 +1178,10 @@ static void expire_mount_list(struct list_head *graveyard, struct list_head *mou /* don't do anything if the namespace is dead - all the * vfsmounts from it are going away anyway */ - namespace = mnt->mnt_namespace; - if (!namespace || !namespace->root) + ns = mnt->mnt_ns; + if (!ns || !ns->root) continue; - get_namespace(namespace); + get_mnt_ns(ns); spin_unlock(&vfsmount_lock); down_write(&namespace_sem); @@ -1189,7 +1189,7 @@ static void expire_mount_list(struct list_head *graveyard, struct list_head *mou up_write(&namespace_sem); release_mounts(&umounts); mntput(mnt); - put_namespace(namespace); + put_mnt_ns(ns); spin_lock(&vfsmount_lock); } } @@ -1439,14 +1439,15 @@ dput_out: * Allocate a new namespace structure and populate it with contents * copied from the namespace of the passed in task structure. */ -struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) +struct mnt_namespace *dup_mnt_ns(struct task_struct *tsk, + struct fs_struct *fs) { - struct namespace *namespace = tsk->nsproxy->namespace; - struct namespace *new_ns; + struct mnt_namespace *mnt_ns = tsk->nsproxy->mnt_ns; + struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; struct vfsmount *p, *q; - new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return NULL; @@ -1457,7 +1458,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) down_write(&namespace_sem); /* First pass: copy the tree topology */ - new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root, + new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, CL_COPY_ALL | CL_EXPIRE); if (!new_ns->root) { up_write(&namespace_sem); @@ -1473,10 +1474,10 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ - p = namespace->root; + p = mnt_ns->root; q = new_ns->root; while (p) { - q->mnt_namespace = new_ns; + q->mnt_ns = new_ns; if (fs) { if (p == fs->rootmnt) { rootmnt = p; @@ -1491,7 +1492,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) fs->altrootmnt = mntget(q); } } - p = next_mnt(p, namespace->root); + p = next_mnt(p, mnt_ns->root); q = next_mnt(q, new_ns->root); } up_write(&namespace_sem); @@ -1506,16 +1507,16 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) return new_ns; } -int copy_namespace(int flags, struct task_struct *tsk) +int copy_mnt_ns(int flags, struct task_struct *tsk) { - struct namespace *namespace = tsk->nsproxy->namespace; - struct namespace *new_ns; + struct mnt_namespace *ns = tsk->nsproxy->mnt_ns; + struct mnt_namespace *new_ns; int err = 0; - if (!namespace) + if (!ns) return 0; - get_namespace(namespace); + get_mnt_ns(ns); if (!(flags & CLONE_NEWNS)) return 0; @@ -1525,16 +1526,16 @@ int copy_namespace(int flags, struct task_struct *tsk) goto out; } - new_ns = dup_namespace(tsk, tsk->fs); + new_ns = dup_mnt_ns(tsk, tsk->fs); if (!new_ns) { err = -ENOMEM; goto out; } - tsk->nsproxy->namespace = new_ns; + tsk->nsproxy->mnt_ns = new_ns; out: - put_namespace(namespace); + put_mnt_ns(ns); return err; } @@ -1754,7 +1755,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root, detach_mnt(user_nd.mnt, &root_parent); attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */ attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */ - touch_namespace(current->nsproxy->namespace); + touch_mnt_namespace(current->nsproxy->mnt_ns); spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); security_sb_post_pivotroot(&user_nd, &new_nd); @@ -1779,27 +1780,27 @@ out3: static void __init init_mount_tree(void) { struct vfsmount *mnt; - struct namespace *namespace; + struct mnt_namespace *ns; mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); - namespace = kmalloc(sizeof(*namespace), GFP_KERNEL); - if (!namespace) + ns = kmalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) panic("Can't allocate initial namespace"); - atomic_set(&namespace->count, 1); - INIT_LIST_HEAD(&namespace->list); - init_waitqueue_head(&namespace->poll); - namespace->event = 0; - list_add(&mnt->mnt_list, &namespace->list); - namespace->root = mnt; - mnt->mnt_namespace = namespace; - - init_task.nsproxy->namespace = namespace; - get_namespace(namespace); - - set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); - set_fs_root(current->fs, namespace->root, namespace->root->mnt_root); + atomic_set(&ns->count, 1); + INIT_LIST_HEAD(&ns->list); + init_waitqueue_head(&ns->poll); + ns->event = 0; + list_add(&mnt->mnt_list, &ns->list); + ns->root = mnt; + mnt->mnt_ns = ns; + + init_task.nsproxy->mnt_ns = ns; + get_mnt_ns(ns); + + set_fs_pwd(current->fs, ns->root, ns->root->mnt_root); + set_fs_root(current->fs, ns->root, ns->root->mnt_root); } void __init mnt_init(unsigned long mempages) @@ -1860,11 +1861,11 @@ void __init mnt_init(unsigned long mempages) init_mount_tree(); } -void __put_namespace(struct namespace *namespace) +void __put_mnt_ns(struct mnt_namespace *ns) { - struct vfsmount *root = namespace->root; + struct vfsmount *root = ns->root; LIST_HEAD(umount_list); - namespace->root = NULL; + ns->root = NULL; spin_unlock(&vfsmount_lock); down_write(&namespace_sem); spin_lock(&vfsmount_lock); @@ -1872,5 +1873,5 @@ void __put_namespace(struct namespace *namespace) spin_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); - kfree(namespace); + kfree(ns); } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 20c6f39ea38a..8391bd7a83ce 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/pnode.c b/fs/pnode.c index da42ee61c1df..56aacead8362 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -6,7 +6,7 @@ * Author : Ram Pai (linuxram@us.ibm.com) * */ -#include +#include #include #include #include "pnode.h" diff --git a/fs/pnode.h b/fs/pnode.h index 020e1bb60fdb..d45bd8ec36bf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -13,7 +13,7 @@ #define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED) #define IS_MNT_SLAVE(mnt) (mnt->mnt_master) -#define IS_MNT_NEW(mnt) (!mnt->mnt_namespace) +#define IS_MNT_NEW(mnt) (!mnt->mnt_ns) #define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(mnt) (mnt->mnt_flags & MNT_UNBINDABLE) diff --git a/fs/proc/base.c b/fs/proc/base.c index a71f1755bb57..a3b5074118a7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include #include @@ -365,33 +365,33 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { struct task_struct *task = get_proc_task(inode); - struct namespace *namespace = NULL; + struct mnt_namespace *ns = NULL; struct proc_mounts *p; int ret = -EINVAL; if (task) { task_lock(task); - namespace = task->nsproxy->namespace; - if (namespace) - get_namespace(namespace); + ns = task->nsproxy->mnt_ns; + if (ns) + get_mnt_ns(ns); task_unlock(task); put_task_struct(task); } - if (namespace) { + if (ns) { ret = -ENOMEM; p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); if (p) { file->private_data = &p->m; ret = seq_open(file, &mounts_op); if (!ret) { - p->m.private = namespace; - p->event = namespace->event; + p->m.private = ns; + p->event = ns->event; return 0; } kfree(p); } - put_namespace(namespace); + put_mnt_ns(ns); } return ret; } @@ -399,15 +399,15 @@ static int mounts_open(struct inode *inode, struct file *file) static int mounts_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - struct namespace *namespace = m->private; - put_namespace(namespace); + struct mnt_namespace *ns = m->private; + put_mnt_ns(ns); return seq_release(inode, file); } static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; - struct namespace *ns = p->m.private; + struct mnt_namespace *ns = p->m.private; unsigned res = 0; poll_wait(file, &ns->poll, wait); @@ -437,21 +437,21 @@ static int mountstats_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace = NULL; + struct mnt_namespace *mnt_ns = NULL; struct task_struct *task = get_proc_task(inode); if (task) { task_lock(task); if (task->nsproxy) - namespace = task->nsproxy->namespace; - if (namespace) - get_namespace(namespace); + mnt_ns = task->nsproxy->mnt_ns; + if (mnt_ns) + get_mnt_ns(mnt_ns); task_unlock(task); put_task_struct(task); } - if (namespace) - m->private = namespace; + if (mnt_ns) + m->private = mnt_ns; else { seq_release(inode, file); ret = -EINVAL; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7fb5fb036f90..58ad4551a7c1 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 848a68af3d42..5c4989172f7e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -76,7 +76,7 @@ extern struct nsproxy init_nsproxy; .count = ATOMIC_INIT(1), \ .nslock = __SPIN_LOCK_UNLOCKED(nsproxy.nslock), \ .uts_ns = &init_uts_ns, \ - .namespace = NULL, \ + .mnt_ns = NULL, \ INIT_IPC_NS(ipc_ns) \ } diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h new file mode 100644 index 000000000000..4af0b1fc282a --- /dev/null +++ b/include/linux/mnt_namespace.h @@ -0,0 +1,42 @@ +#ifndef _NAMESPACE_H_ +#define _NAMESPACE_H_ +#ifdef __KERNEL__ + +#include +#include +#include + +struct mnt_namespace { + atomic_t count; + struct vfsmount * root; + struct list_head list; + wait_queue_head_t poll; + int event; +}; + +extern int copy_mnt_ns(int, struct task_struct *); +extern void __put_mnt_ns(struct mnt_namespace *ns); +extern struct mnt_namespace *dup_mnt_ns(struct task_struct *, + struct fs_struct *); + +static inline void put_mnt_ns(struct mnt_namespace *ns) +{ + if (atomic_dec_and_lock(&ns->count, &vfsmount_lock)) + /* releases vfsmount_lock */ + __put_mnt_ns(ns); +} + +static inline void exit_mnt_ns(struct task_struct *p) +{ + struct mnt_namespace *ns = p->nsproxy->mnt_ns; + if (ns) + put_mnt_ns(ns); +} + +static inline void get_mnt_ns(struct mnt_namespace *ns) +{ + atomic_inc(&ns->count); +} + +#endif +#endif diff --git a/include/linux/mount.h b/include/linux/mount.h index 403d1a97c512..e357dc86a4de 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -20,7 +20,7 @@ struct super_block; struct vfsmount; struct dentry; -struct namespace; +struct mnt_namespace; #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 @@ -52,7 +52,7 @@ struct vfsmount { struct list_head mnt_slave_list;/* list of slave mounts */ struct list_head mnt_slave; /* slave list entry */ struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */ - struct namespace *mnt_namespace; /* containing namespace */ + struct mnt_namespace *mnt_ns; /* containing namespace */ int mnt_pinned; }; diff --git a/include/linux/namespace.h b/include/linux/namespace.h deleted file mode 100644 index d137009f0b2b..000000000000 --- a/include/linux/namespace.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _NAMESPACE_H_ -#define _NAMESPACE_H_ -#ifdef __KERNEL__ - -#include -#include -#include - -struct namespace { - atomic_t count; - struct vfsmount * root; - struct list_head list; - wait_queue_head_t poll; - int event; -}; - -extern int copy_namespace(int, struct task_struct *); -extern void __put_namespace(struct namespace *namespace); -extern struct namespace *dup_namespace(struct task_struct *, struct fs_struct *); - -static inline void put_namespace(struct namespace *namespace) -{ - if (atomic_dec_and_lock(&namespace->count, &vfsmount_lock)) - /* releases vfsmount_lock */ - __put_namespace(namespace); -} - -static inline void exit_namespace(struct task_struct *p) -{ - struct namespace *namespace = p->nsproxy->namespace; - if (namespace) { - put_namespace(namespace); - } -} - -static inline void get_namespace(struct namespace *namespace) -{ - atomic_inc(&namespace->count); -} - -#endif -#endif diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 971d1c6dfc4b..0aba1b1a39c7 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -4,7 +4,7 @@ #include #include -struct namespace; +struct mnt_namespace; struct uts_namespace; struct ipc_namespace; @@ -25,7 +25,7 @@ struct nsproxy { spinlock_t nslock; struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; - struct namespace *namespace; + struct mnt_namespace *mnt_ns; }; extern struct nsproxy init_nsproxy; diff --git a/kernel/exit.c b/kernel/exit.c index 6267a6cc6113..28d9feedfd27 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 60d2644bfe85..8c859eef8e6a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -1525,17 +1525,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the namespace structure if it is being shared + * Unshare the mnt_namespace structure if it is being shared */ -static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) +static int unshare_mnt_namespace(unsigned long unshare_flags, + struct mnt_namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->nsproxy->namespace; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); + *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); if (!*new_nsp) return -ENOMEM; } @@ -1623,7 +1624,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct namespace *ns, *new_ns = NULL; + struct mnt_namespace *ns, *new_ns = NULL; struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; @@ -1645,7 +1646,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) + if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) goto bad_unshare_cleanup_ns; @@ -1686,8 +1687,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->nsproxy->namespace; - current->nsproxy->namespace = new_ns; + ns = current->nsproxy->mnt_ns; + current->nsproxy->mnt_ns = new_ns; new_ns = ns; } @@ -1748,7 +1749,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_ns: if (new_ns) - put_namespace(new_ns); + put_mnt_ns(new_ns); bad_unshare_cleanup_fs: if (new_fs) diff --git a/kernel/kmod.c b/kernel/kmod.c index 8d2bea09a4ec..3a7379aa31ca 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 674aceb7335a..bd9cb435dfe0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); @@ -60,8 +60,8 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) struct nsproxy *ns = clone_namespaces(orig); if (ns) { - if (ns->namespace) - get_namespace(ns->namespace); + if (ns->mnt_ns) + get_mnt_ns(ns->mnt_ns); if (ns->uts_ns) get_uts_ns(ns->uts_ns); if (ns->ipc_ns) @@ -97,7 +97,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) tsk->nsproxy = new_ns; - err = copy_namespace(flags, tsk); + err = copy_mnt_ns(flags, tsk); if (err) goto out_ns; @@ -117,8 +117,8 @@ out_ipc: if (new_ns->uts_ns) put_uts_ns(new_ns->uts_ns); out_uts: - if (new_ns->namespace) - put_namespace(new_ns->namespace); + if (new_ns->mnt_ns) + put_mnt_ns(new_ns->mnt_ns); out_ns: tsk->nsproxy = old_ns; kfree(new_ns); @@ -127,8 +127,8 @@ out_ns: void free_nsproxy(struct nsproxy *ns) { - if (ns->namespace) - put_namespace(ns->namespace); + if (ns->mnt_ns) + put_mnt_ns(ns->mnt_ns); if (ns->uts_ns) put_uts_ns(ns->uts_ns); if (ns->ipc_ns) -- cgit v1.2.3 From 84d737866e2babdeab0c6b18ea155c6a649663b8 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Fri, 8 Dec 2006 02:38:01 -0800 Subject: [PATCH] add child reaper to pid_namespace Add a per pid_namespace child-reaper. This is needed so processes are reaped within the same pid space and do not spill over to the parent pid space. Its also needed so containers preserve existing semantic that pid == 1 would reap orphaned children. This is based on Eric Biederman's patch: http://lkml.org/lkml/2006/2/6/285 Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Cedric Le Goater Cc: Kirill Korotaev Cc: Eric W. Biederman Cc: Herbert Poetzl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 5 +++-- include/linux/pid.h | 5 +++-- include/linux/pid_namespace.h | 6 ++++++ include/linux/sched.h | 1 - init/main.c | 5 ++--- kernel/exit.c | 23 +++++++++++++++-------- kernel/pid.c | 3 ++- kernel/signal.c | 11 +++++++++-- 8 files changed, 40 insertions(+), 19 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index 60433e2254a4..12d8cd461b41 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -620,8 +621,8 @@ static int de_thread(struct task_struct *tsk) * Reparenting needs write_lock on tasklist_lock, * so it is safe to do it under read_lock. */ - if (unlikely(tsk->group_leader == child_reaper)) - child_reaper = tsk; + if (unlikely(tsk->group_leader == child_reaper(tsk))) + tsk->nsproxy->pid_ns->child_reaper = tsk; zap_other_threads(tsk); read_unlock(&tasklist_lock); diff --git a/include/linux/pid.h b/include/linux/pid.h index 2c0007d17218..4dec047b1837 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -35,8 +35,9 @@ enum pid_type * * Holding a reference to struct pid solves both of these problems. * It is small so holding a reference does not consume a lot of - * resources, and since a new struct pid is allocated when the numeric - * pid value is reused we don't mistakenly refer to new processes. + * resources, and since a new struct pid is allocated when the numeric pid + * value is reused (when pids wrap around) we don't mistakenly refer to new + * processes. */ struct pid diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 76e7c6b2cf33..d2a9d419f01f 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -19,6 +19,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; + struct task_struct *child_reaper; }; extern struct pid_namespace init_pid_ns; @@ -36,4 +37,9 @@ static inline void put_pid_ns(struct pid_namespace *ns) kref_put(&ns->kref, free_pid_ns); } +static inline struct task_struct *child_reaper(struct task_struct *tsk) +{ + return tsk->nsproxy->pid_ns->child_reaper; +} + #endif /* _LINUX_PID_NS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6fec1d419714..f0317edea141 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,7 +1400,6 @@ extern NORET_TYPE void do_group_exit(int); extern void daemonize(const char *, ...); extern int allow_signal(int); extern int disallow_signal(int); -extern struct task_struct *child_reaper; extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); diff --git a/init/main.c b/init/main.c index 4cdcd06e6d78..036f97c0c34c 100644 --- a/init/main.c +++ b/init/main.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -626,8 +627,6 @@ static int __init initcall_debug_setup(char *str) } __setup("initcall_debug", initcall_debug_setup); -struct task_struct *child_reaper = &init_task; - extern initcall_t __initcall_start[], __initcall_end[]; static void __init do_initcalls(void) @@ -727,7 +726,7 @@ static int init(void * unused) * assumptions about where in the task array this * can be found. */ - child_reaper = current; + init_pid_ns.child_reaper = current; cad_pid = task_pid(current); diff --git a/kernel/exit.c b/kernel/exit.c index 28d9feedfd27..fd0e067952ab 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,7 +49,6 @@ #include extern void sem_exit (void); -extern struct task_struct *child_reaper; static void exit_mm(struct task_struct * tsk); @@ -260,7 +260,8 @@ static int has_stopped_jobs(int pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task. + * reparent_to_init - Reparent the calling kernel thread to the init task + * of the pid space that the thread belongs to. * * If a kernel thread is launched as a result of a system call, or if * it ever exits, it should generally reparent itself to init so that @@ -278,8 +279,8 @@ static void reparent_to_init(void) ptrace_unlink(current); /* Reparent to init */ remove_parent(current); - current->parent = child_reaper; - current->real_parent = child_reaper; + current->parent = child_reaper(current); + current->real_parent = child_reaper(current); add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ @@ -662,7 +663,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * When we die, we re-parent all our children. * Try to give them to another thread in our thread * group, and if no such member exists, give it to - * the global child reaper process (ie "init") + * the child reaper process (ie "init") in our pid + * space. */ static void forget_original_parent(struct task_struct *father, struct list_head *to_release) @@ -673,7 +675,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) do { reaper = next_thread(reaper); if (reaper == father) { - reaper = child_reaper; + reaper = child_reaper(father); break; } } while (reaper->exit_state); @@ -859,8 +861,13 @@ fastcall NORET_TYPE void do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(tsk == child_reaper)) - panic("Attempted to kill init!"); + if (unlikely(tsk == child_reaper(tsk))) { + if (tsk->nsproxy->pid_ns != &init_pid_ns) + tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper; + else + panic("Attempted to kill init!"); + } + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { current->ptrace_message = code; diff --git a/kernel/pid.c b/kernel/pid.c index 1d9cc268b499..2efe9d8d367b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -65,7 +65,8 @@ struct pid_namespace init_pid_ns = { .pidmap = { [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }, - .last_pid = 0 + .last_pid = 0, + .child_reaper = &init_task }; /* diff --git a/kernel/signal.c b/kernel/signal.c index 9eac4db60eda..1921ffdc5e77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -24,6 +24,9 @@ #include #include #include +#include +#include + #include #include #include @@ -1877,8 +1880,12 @@ relock: if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; - /* Init gets no signals it doesn't want. */ - if (current == child_reaper) + /* + * Init of a pid space gets no signals it doesn't want from + * within that pid space. It can of course get signals from + * its parent pid space. + */ + if (current == child_reaper(current)) continue; if (sig_kernel_stop(signr)) { -- cgit v1.2.3 From 62dfb5541a025b47df9405ff0219c7829a97d83b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 8 Dec 2006 02:38:03 -0800 Subject: [PATCH] session_of_pgrp: kill unnecessary do_each_task_pid(PIDTYPE_PGID) All members of the process group have the same sid and it can't be == 0. NOTE: this code (and a similar one in sys_setpgid) was needed because it was possibe to have ->session == 0. It's not possible any longer since [PATCH] pidhash: don't use zero pids Commit: c7c6464117a02b0d54feb4ebeca4db70fa493678 Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index fd0e067952ab..03e64fe4a14a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -189,21 +189,18 @@ repeat: int session_of_pgrp(int pgrp) { struct task_struct *p; - int sid = -1; + int sid = 0; read_lock(&tasklist_lock); - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { - if (process_session(p) > 0) { - sid = process_session(p); - goto out; - } - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); - p = find_task_by_pid(pgrp); - if (p) + + p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); + if (p == NULL) + p = find_task_by_pid(pgrp); + if (p != NULL) sid = process_session(p); -out: + read_unlock(&tasklist_lock); - + return sid; } -- cgit v1.2.3 From bbea9f69668a3d0cf9feba15a724cd02896f8675 Mon Sep 17 00:00:00 2001 From: Vadim Lobanov Date: Sun, 10 Dec 2006 02:21:12 -0800 Subject: [PATCH] fdtable: Make fdarray and fdsets equal in size Currently, each fdtable supports three dynamically-sized arrays of data: the fdarray and two fdsets. The code allows the number of fds supported by the fdarray (fdtable->max_fds) to differ from the number of fds supported by each of the fdsets (fdtable->max_fdset). In practice, it is wasteful for these two sizes to differ: whenever we hit a limit on the smaller-capacity structure, we will reallocate the entire fdtable and all the dynamic arrays within it, so any delta in the memory used by the larger-capacity structure will never be touched at all. Rather than hogging this excess, we shouldn't even allocate it in the first place, and keep the capacities of the fdarray and the fdsets equal. This patch removes fdtable->max_fdset. As an added bonus, most of the supporting code becomes simpler. Signed-off-by: Vadim Lobanov Cc: Christoph Hellwig Cc: Al Viro Cc: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/osf_sys.c | 6 ++--- arch/mips/kernel/kspd.c | 2 +- fs/compat.c | 10 ++++---- fs/exec.c | 2 +- fs/fcntl.c | 5 ++-- fs/file.c | 56 ++++++++++++++++++--------------------------- fs/open.c | 3 +-- fs/select.c | 10 ++++---- include/linux/file.h | 6 ----- include/linux/init_task.h | 1 - kernel/exit.c | 2 +- kernel/fork.c | 24 +++++-------------- security/selinux/hooks.c | 2 +- 13 files changed, 48 insertions(+), 81 deletions(-) (limited to 'kernel/exit.c') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index fb804043b320..be133f1f75a4 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -979,7 +979,7 @@ osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, long timeout; int ret = -EINVAL; struct fdtable *fdt; - int max_fdset; + int max_fds; timeout = MAX_SCHEDULE_TIMEOUT; if (tvp) { @@ -1003,9 +1003,9 @@ osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, rcu_read_lock(); fdt = files_fdtable(current->files); - max_fdset = fdt->max_fdset; + max_fds = fdt->max_fds; rcu_read_unlock(); - if (n < 0 || n > max_fdset) + if (n < 0 || n > max_fds) goto out_nofds; /* diff --git a/arch/mips/kernel/kspd.c b/arch/mips/kernel/kspd.c index 2c82412b9efe..5929f883e46b 100644 --- a/arch/mips/kernel/kspd.c +++ b/arch/mips/kernel/kspd.c @@ -301,7 +301,7 @@ static void sp_cleanup(void) for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= fdt->max_fdset || i >= fdt->max_fds) + if (i >= fdt->max_fds) break; set = fdt->open_fds->fds_bits[j++]; while (set) { diff --git a/fs/compat.c b/fs/compat.c index b766964a625c..0ec70e3cee0a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1679,19 +1679,19 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp, { fd_set_bits fds; char *bits; - int size, max_fdset, ret = -EINVAL; + int size, max_fds, ret = -EINVAL; struct fdtable *fdt; if (n < 0) goto out_nofds; - /* max_fdset can increase, so grab it once to avoid race */ + /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); - max_fdset = fdt->max_fdset; + max_fds = fdt->max_fds; rcu_read_unlock(); - if (n > max_fdset) - n = max_fdset; + if (n > max_fds) + n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), diff --git a/fs/exec.c b/fs/exec.c index 12d8cd461b41..11fe93f7363c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -783,7 +783,7 @@ static void flush_old_files(struct files_struct * files) j++; i = j * __NFDBITS; fdt = files_fdtable(files); - if (i >= fdt->max_fds || i >= fdt->max_fdset) + if (i >= fdt->max_fds) break; set = fdt->close_on_exec->fds_bits[j]; if (!set) diff --git a/fs/fcntl.c b/fs/fcntl.c index 2bdaef35da54..8e382a5d51bd 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -77,10 +77,9 @@ repeat: start = files->next_fd; newfd = start; - if (start < fdt->max_fdset) { + if (start < fdt->max_fds) newfd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fdset, start); - } + fdt->max_fds, start); error = -EMFILE; if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) diff --git a/fs/file.c b/fs/file.c index 51aef675470f..fb3d2038dc21 100644 --- a/fs/file.c +++ b/fs/file.c @@ -68,8 +68,8 @@ void free_fd_array(struct file **array, int num) static void __free_fdtable(struct fdtable *fdt) { - free_fdset(fdt->open_fds, fdt->max_fdset); - free_fdset(fdt->close_on_exec, fdt->max_fdset); + free_fdset(fdt->open_fds, fdt->max_fds); + free_fdset(fdt->close_on_exec, fdt->max_fds); free_fd_array(fdt->fd, fdt->max_fds); kfree(fdt); } @@ -98,7 +98,7 @@ static void free_fdtable_rcu(struct rcu_head *rcu) struct fdtable_defer *fddef; BUG_ON(!fdt); - fdset_size = fdt->max_fdset / 8; + fdset_size = fdt->max_fds / 8; fdarray_size = fdt->max_fds * sizeof(struct file *); if (fdt->free_files) { @@ -110,13 +110,11 @@ static void free_fdtable_rcu(struct rcu_head *rcu) kmem_cache_free(files_cachep, fdt->free_files); return; } - if (fdt->max_fdset <= EMBEDDED_FD_SET_SIZE && - fdt->max_fds <= NR_OPEN_DEFAULT) { + if (fdt->max_fds <= NR_OPEN_DEFAULT) /* * The fdtable was embedded */ return; - } if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) { kfree(fdt->open_fds); kfree(fdt->close_on_exec); @@ -136,9 +134,7 @@ static void free_fdtable_rcu(struct rcu_head *rcu) void free_fdtable(struct fdtable *fdt) { - if (fdt->free_files || - fdt->max_fdset > EMBEDDED_FD_SET_SIZE || - fdt->max_fds > NR_OPEN_DEFAULT) + if (fdt->free_files || fdt->max_fds > NR_OPEN_DEFAULT) call_rcu(&fdt->rcu, free_fdtable_rcu); } @@ -151,12 +147,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt) int i; int count; - BUG_ON(nfdt->max_fdset < fdt->max_fdset); BUG_ON(nfdt->max_fds < fdt->max_fds); /* Copy the existing tables and install the new pointers */ - i = fdt->max_fdset / (sizeof(unsigned long) * 8); - count = (nfdt->max_fdset - fdt->max_fdset) / 8; + i = fdt->max_fds / (sizeof(unsigned long) * 8); + count = (nfdt->max_fds - fdt->max_fds) / 8; /* * Don't copy the entire array if the current fdset is @@ -164,9 +159,9 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt) */ if (i) { memcpy (nfdt->open_fds, fdt->open_fds, - fdt->max_fdset/8); + fdt->max_fds/8); memcpy (nfdt->close_on_exec, fdt->close_on_exec, - fdt->max_fdset/8); + fdt->max_fds/8); memset (&nfdt->open_fds->fds_bits[i], 0, count); memset (&nfdt->close_on_exec->fds_bits[i], 0, count); } @@ -201,7 +196,7 @@ fd_set * alloc_fdset(int num) void free_fdset(fd_set *array, int num) { - if (num <= EMBEDDED_FD_SET_SIZE) /* Don't free an embedded fdset */ + if (num <= NR_OPEN_DEFAULT) /* Don't free an embedded fdset */ return; else if (num <= 8 * PAGE_SIZE) kfree(array); @@ -220,18 +215,6 @@ static struct fdtable *alloc_fdtable(int nr) if (!fdt) goto out; - nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nr + 1)); - if (nfds > NR_OPEN) - nfds = NR_OPEN; - - new_openset = alloc_fdset(nfds); - new_execset = alloc_fdset(nfds); - if (!new_openset || !new_execset) - goto out; - fdt->open_fds = new_openset; - fdt->close_on_exec = new_execset; - fdt->max_fdset = nfds; - nfds = NR_OPEN_DEFAULT; /* * Expand to the max in easy steps, and keep expanding it until @@ -251,15 +234,21 @@ static struct fdtable *alloc_fdtable(int nr) nfds = NR_OPEN; } } while (nfds <= nr); + + new_openset = alloc_fdset(nfds); + new_execset = alloc_fdset(nfds); + if (!new_openset || !new_execset) + goto out; + fdt->open_fds = new_openset; + fdt->close_on_exec = new_execset; + new_fds = alloc_fd_array(nfds); if (!new_fds) - goto out2; + goto out; fdt->fd = new_fds; fdt->max_fds = nfds; fdt->free_files = NULL; return fdt; -out2: - nfds = fdt->max_fdset; out: free_fdset(new_openset, nfds); free_fdset(new_execset, nfds); @@ -290,7 +279,7 @@ static int expand_fdtable(struct files_struct *files, int nr) * we dropped the lock */ cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds || nr >= cur_fdt->max_fdset) { + if (nr >= cur_fdt->max_fds) { /* Continue as planned */ copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); @@ -316,11 +305,10 @@ int expand_files(struct files_struct *files, int nr) fdt = files_fdtable(files); /* Do we need to expand? */ - if (nr < fdt->max_fdset && nr < fdt->max_fds) + if (nr < fdt->max_fds) return 0; /* Can we expand? */ - if (fdt->max_fdset >= NR_OPEN || fdt->max_fds >= NR_OPEN || - nr >= NR_OPEN) + if (nr >= NR_OPEN) return -EMFILE; /* All good, so we try */ diff --git a/fs/open.c b/fs/open.c index 0d94319e8681..c989fb4cf7b9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -864,8 +864,7 @@ int get_unused_fd(void) repeat: fdt = files_fdtable(files); - fd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fdset, + fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, files->next_fd); /* diff --git a/fs/select.c b/fs/select.c index dcbc1112b7ec..fe0893afd931 100644 --- a/fs/select.c +++ b/fs/select.c @@ -311,7 +311,7 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, { fd_set_bits fds; void *bits; - int ret, max_fdset; + int ret, max_fds; unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ @@ -321,13 +321,13 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (n < 0) goto out_nofds; - /* max_fdset can increase, so grab it once to avoid race */ + /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); - max_fdset = fdt->max_fdset; + max_fds = fdt->max_fds; rcu_read_unlock(); - if (n > max_fdset) - n = max_fdset; + if (n > max_fds) + n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), diff --git a/include/linux/file.h b/include/linux/file.h index 6e77b9177f9e..02be4012225b 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -26,14 +26,8 @@ struct embedded_fd_set { unsigned long fds_bits[1]; }; -/* - * More than this number of fds: we use a separately allocated fd_set - */ -#define EMBEDDED_FD_SET_SIZE (BITS_PER_BYTE * sizeof(struct embedded_fd_set)) - struct fdtable { unsigned int max_fds; - int max_fdset; struct file ** fd; /* current fd array */ fd_set *close_on_exec; fd_set *open_fds; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 7272ff9ee77c..58c18daab65d 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -12,7 +12,6 @@ #define INIT_FDTABLE \ { \ .max_fds = NR_OPEN_DEFAULT, \ - .max_fdset = EMBEDDED_FD_SET_SIZE, \ .fd = &init_files.fd_array[0], \ .close_on_exec = (fd_set *)&init_files.close_on_exec_init, \ .open_fds = (fd_set *)&init_files.open_fds_init, \ diff --git a/kernel/exit.c b/kernel/exit.c index 03e64fe4a14a..5f77e76b4f97 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -425,7 +425,7 @@ static void close_files(struct files_struct * files) for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= fdt->max_fdset || i >= fdt->max_fds) + if (i >= fdt->max_fds) break; set = fdt->open_fds->fds_bits[j++]; while (set) { diff --git a/kernel/fork.c b/kernel/fork.c index 30eab4f063cd..aba595424f78 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -614,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) static int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fdset; + int size = fdt->max_fds; int i; /* Find the last open fd */ @@ -641,7 +641,6 @@ static struct files_struct *alloc_files(void) newf->next_fd = 0; fdt = &newf->fdtab; fdt->max_fds = NR_OPEN_DEFAULT; - fdt->max_fdset = EMBEDDED_FD_SET_SIZE; fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; @@ -662,7 +661,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i, expand; + int open_files, size, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -673,25 +672,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); new_fdt = files_fdtable(newf); - size = old_fdt->max_fdset; open_files = count_open_files(old_fdt); - expand = 0; /* - * Check whether we need to allocate a larger fd array or fd set. - * Note: we're not a clone task, so the open count won't change. + * Check whether we need to allocate a larger fd array and fd set. + * Note: we're not a clone task, so the open count won't change. */ - if (open_files > new_fdt->max_fdset) { - new_fdt->max_fdset = 0; - expand = 1; - } if (open_files > new_fdt->max_fds) { new_fdt->max_fds = 0; - expand = 1; - } - - /* if the old fdset gets grown now, we'll only copy up to "size" fds */ - if (expand) { spin_unlock(&oldf->file_lock); spin_lock(&newf->file_lock); *errorp = expand_files(newf, open_files-1); @@ -739,8 +727,8 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (new_fdt->max_fdset > open_files) { - int left = (new_fdt->max_fdset-open_files)/8; + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); memset(&new_fdt->open_fds->fds_bits[start], 0, left); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 3753416eb9b9..65fb5e8ea941 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1734,7 +1734,7 @@ static inline void flush_unauthorized_files(struct files_struct * files) j++; i = j * __NFDBITS; fdt = files_fdtable(files); - if (i >= fdt->max_fds || i >= fdt->max_fdset) + if (i >= fdt->max_fds) break; set = fdt->open_fds->fds_bits[j]; if (!set) -- cgit v1.2.3 From 4fd45812cbe875a620c86a096a5d46c742694b7e Mon Sep 17 00:00:00 2001 From: Vadim Lobanov Date: Sun, 10 Dec 2006 02:21:17 -0800 Subject: [PATCH] fdtable: Remove the free_files field An fdtable can either be embedded inside a files_struct or standalone (after being expanded). When an fdtable is being discarded after all RCU references to it have expired, we must either free it directly, in the standalone case, or free the files_struct it is contained within, in the embedded case. Currently the free_files field controls this behavior, but we can get rid of it entirely, as all the necessary information is already recorded. We can distinguish embedded and standalone fdtables using max_fds, and if it is embedded we can divine the relevant files_struct using container_of(). Signed-off-by: Vadim Lobanov Cc: Christoph Hellwig Cc: Al Viro Cc: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 27 ++++++++------------------- include/linux/file.h | 3 +-- include/linux/init_task.h | 1 - kernel/exit.c | 6 ++---- kernel/fork.c | 1 - 5 files changed, 11 insertions(+), 27 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/file.c b/fs/file.c index fb3d2038dc21..17e6a55521e2 100644 --- a/fs/file.c +++ b/fs/file.c @@ -91,7 +91,7 @@ static void free_fdtable_work(struct work_struct *work) } } -static void free_fdtable_rcu(struct rcu_head *rcu) +void free_fdtable_rcu(struct rcu_head *rcu) { struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); int fdset_size, fdarray_size; @@ -101,20 +101,15 @@ static void free_fdtable_rcu(struct rcu_head *rcu) fdset_size = fdt->max_fds / 8; fdarray_size = fdt->max_fds * sizeof(struct file *); - if (fdt->free_files) { + if (fdt->max_fds <= NR_OPEN_DEFAULT) { /* - * The this fdtable was embedded in the files structure - * and the files structure itself was getting destroyed. - * It is now safe to free the files structure. + * This fdtable is embedded in the files structure and that + * structure itself is getting destroyed. */ - kmem_cache_free(files_cachep, fdt->free_files); + kmem_cache_free(files_cachep, + container_of(fdt, struct files_struct, fdtab)); return; } - if (fdt->max_fds <= NR_OPEN_DEFAULT) - /* - * The fdtable was embedded - */ - return; if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) { kfree(fdt->open_fds); kfree(fdt->close_on_exec); @@ -132,12 +127,6 @@ static void free_fdtable_rcu(struct rcu_head *rcu) } } -void free_fdtable(struct fdtable *fdt) -{ - if (fdt->free_files || fdt->max_fds > NR_OPEN_DEFAULT) - call_rcu(&fdt->rcu, free_fdtable_rcu); -} - /* * Expand the fdset in the files_struct. Called with the files spinlock * held for write. @@ -247,7 +236,6 @@ static struct fdtable *alloc_fdtable(int nr) goto out; fdt->fd = new_fds; fdt->max_fds = nfds; - fdt->free_files = NULL; return fdt; out: free_fdset(new_openset, nfds); @@ -283,7 +271,8 @@ static int expand_fdtable(struct files_struct *files, int nr) /* Continue as planned */ copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); - free_fdtable(cur_fdt); + if (cur_fdt->max_fds > NR_OPEN_DEFAULT) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); } else { /* Somebody else expanded, so undo our attempt */ __free_fdtable(new_fdt); diff --git a/include/linux/file.h b/include/linux/file.h index 02be4012225b..319118f275b0 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -32,7 +32,6 @@ struct fdtable { fd_set *close_on_exec; fd_set *open_fds; struct rcu_head rcu; - struct files_struct *free_files; struct fdtable *next; }; @@ -84,7 +83,7 @@ extern fd_set *alloc_fdset(int); extern void free_fdset(fd_set *, int); extern int expand_files(struct files_struct *, int nr); -extern void free_fdtable(struct fdtable *fdt); +extern void free_fdtable_rcu(struct rcu_head *rcu); extern void __init files_defer_init(void); static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 58c18daab65d..b5315150199e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -16,7 +16,6 @@ .close_on_exec = (fd_set *)&init_files.close_on_exec_init, \ .open_fds = (fd_set *)&init_files.open_fds_init, \ .rcu = RCU_HEAD_INIT, \ - .free_files = NULL, \ .next = NULL, \ } diff --git a/kernel/exit.c b/kernel/exit.c index 5f77e76b4f97..122fadb972fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -466,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files) * you can free files immediately. */ fdt = files_fdtable(files); - if (fdt == &files->fdtab) - fdt->free_files = files; - else + if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); - free_fdtable(fdt); + call_rcu(&fdt->rcu, free_fdtable_rcu); } } diff --git a/kernel/fork.c b/kernel/fork.c index aba595424f78..d16c566eb645 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -645,7 +645,6 @@ static struct files_struct *alloc_files(void) fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; INIT_RCU_HEAD(&fdt->rcu); - fdt->free_files = NULL; fdt->next = NULL; rcu_assign_pointer(newf->fdt, fdt); out: -- cgit v1.2.3 From 01b2d93ca4c495f056471189ac6c4e6ac4cbbccb Mon Sep 17 00:00:00 2001 From: Vadim Lobanov Date: Fri, 22 Dec 2006 01:10:43 -0800 Subject: [PATCH] fdtable: Provide free_fdtable() wrapper Christoph Hellwig has expressed concerns that the recent fdtable changes expose the details of the RCU methodology used to release no-longer-used fdtable structures to the rest of the kernel. The trivial patch below addresses these concerns by introducing the appropriate free_fdtable() calls, which simply wrap the release RCU usage. Since free_fdtable() is a one-liner, it makes sense to promote it to an inline helper. Signed-off-by: Vadim Lobanov Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 2 +- include/linux/file.h | 5 +++++ kernel/exit.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/file.c b/fs/file.c index 857fa49e984c..c5575de01113 100644 --- a/fs/file.c +++ b/fs/file.c @@ -206,7 +206,7 @@ static int expand_fdtable(struct files_struct *files, int nr) copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt->max_fds > NR_OPEN_DEFAULT) - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + free_fdtable(cur_fdt); } else { /* Somebody else expanded, so undo our attempt */ free_fdarr(new_fdt); diff --git a/include/linux/file.h b/include/linux/file.h index edca361f2ab4..a59001e9ea58 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -80,6 +80,11 @@ extern int expand_files(struct files_struct *, int nr); extern void free_fdtable_rcu(struct rcu_head *rcu); extern void __init files_defer_init(void); +static inline void free_fdtable(struct fdtable *fdt) +{ + call_rcu(&fdt->rcu, free_fdtable_rcu); +} + static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) { struct file * file = NULL; diff --git a/kernel/exit.c b/kernel/exit.c index 122fadb972fc..85917c2bf065 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -468,7 +468,7 @@ void fastcall put_files_struct(struct files_struct *files) fdt = files_fdtable(files); if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); - call_rcu(&fdt->rcu, free_fdtable_rcu); + free_fdtable(fdt); } } -- cgit v1.2.3 From b2b2cbc4b2a2f389442549399a993a8306420baf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 21 Dec 2006 21:28:40 -0700 Subject: [PATCH] Fix reparenting to the same thread group. (take 2) This patch fixes the case when we reparent to a different thread in the same thread group. This modifies the code so that we do not send signals and do not change the signal to send to SIGCHLD unless we have change the thread group of our parents. It also suppresses sending pdeath_sig in this cas as well since the result of geppid doesn't change. Thanks to Oleg for spotting my bug of only fixing this for non-ptraced tasks. Signed-off-by: Eric W. Biederman Cc: Mike Galbraith Cc: Albert Cahalan Cc: Andrew Morton Cc: Roland McGrath Cc: Ingo Molnar Cc: Coywolf Qi Hunt Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 85917c2bf065..46cf6b681460 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -597,14 +597,6 @@ choose_new_parent(struct task_struct *p, struct task_struct *reaper) static void reparent_thread(struct task_struct *p, struct task_struct *father, int traced) { - /* We don't want people slaying init. */ - if (p->exit_signal != -1) - p->exit_signal = SIGCHLD; - - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - /* Move the child from its dying parent to the new one. */ if (unlikely(traced)) { /* Preserve ptrace links if someone else is tracing this child. */ @@ -620,13 +612,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) p->parent = p->real_parent; add_parent(p); - /* If we'd notified the old parent about this child's death, - * also notify the new parent. - */ - if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && - thread_group_empty(p)) - do_notify_parent(p, p->exit_signal); - else if (p->state == TASK_TRACED) { + if (p->state == TASK_TRACED) { /* * If it was at a trace stop, turn it into * a normal stop since it's no longer being @@ -636,6 +622,27 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) } } + /* If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (p->real_parent->group_leader == father->group_leader) + return; + + /* We don't want people slaying init. */ + if (p->exit_signal != -1) + p->exit_signal = SIGCHLD; + + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + /* If we'd notified the old parent about this child's death, + * also notify the new parent. + */ + if (!traced && p->exit_state == EXIT_ZOMBIE && + p->exit_signal != -1 && thread_group_empty(p)) + do_notify_parent(p, p->exit_signal); + /* * process group orphan check * Case ii: Our child is in a different pgrp -- cgit v1.2.3 From 241ceee0b442c69226fb882d61d9b9785743898f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 24 Dec 2006 23:30:44 +0300 Subject: [PATCH] restore ->pdeath_signal behaviour Commit b2b2cbc4b2a2f389442549399a993a8306420baf introduced a user- visible change: ->pdeath_signal is sent only when the entire thread group exits. While this change is imho good, it may break things. So restore the old behaviour for now. Signed-off-by: Oleg Nesterov To: Albert Cahalan Cc: Eric W. Biederman Cc: Andrew Morton Cc: Linus Torvalds Cc: Ingo Molnar Cc: Qi Yong Cc: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 46cf6b681460..35401720635b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -597,6 +597,10 @@ choose_new_parent(struct task_struct *p, struct task_struct *reaper) static void reparent_thread(struct task_struct *p, struct task_struct *father, int traced) { + if (p->pdeath_signal) + /* We already hold the tasklist_lock here. */ + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + /* Move the child from its dying parent to the new one. */ if (unlikely(traced)) { /* Preserve ptrace links if someone else is tracing this child. */ @@ -631,10 +635,6 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) /* We don't want people slaying init. */ if (p->exit_signal != -1) p->exit_signal = SIGCHLD; - - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); /* If we'd notified the old parent about this child's death, * also notify the new parent. -- cgit v1.2.3