summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorSteven Whitehouse <swhiteho@redhat.com>2006-03-31 15:34:58 -0500
committerSteven Whitehouse <swhiteho@redhat.com>2006-03-31 15:34:58 -0500
commit86579dd06deecfa6ac88d5e84e4d63c397cd6f6d (patch)
treeb4475d3ccde53015ad84a06e4e55e64591171b75 /kernel
parent7ea9ea832212c4a755650f7c7cc1ff0b63292a41 (diff)
parenta0f067802576d4eb4c65d40b8ee7d6ea3c81dd61 (diff)
downloadlwn-86579dd06deecfa6ac88d5e84e4d63c397cd6f6d.tar.gz
lwn-86579dd06deecfa6ac88d5e84e4d63c397cd6f6d.zip
Merge branch 'master'
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile6
-rw-r--r--kernel/audit.c175
-rw-r--r--kernel/audit.h88
-rw-r--r--kernel/auditfilter.c630
-rw-r--r--kernel/auditsc.c836
-rw-r--r--kernel/capability.c16
-rw-r--r--kernel/compat.c82
-rw-r--r--kernel/cpu.c32
-rw-r--r--kernel/cpuset.c364
-rw-r--r--kernel/exec_domain.c1
-rw-r--r--kernel/exit.c148
-rw-r--r--kernel/fork.c147
-rw-r--r--kernel/futex.c170
-rw-r--r--kernel/futex_compat.c142
-rw-r--r--kernel/hrtimer.c193
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/manage.c24
-rw-r--r--kernel/irq/migration.c65
-rw-r--r--kernel/itimer.c117
-rw-r--r--kernel/kmod.c2
-rw-r--r--kernel/kprobes.c24
-rw-r--r--kernel/ksysfs.c7
-rw-r--r--kernel/kthread.c9
-rw-r--r--kernel/module.c460
-rw-r--r--kernel/panic.c101
-rw-r--r--kernel/params.c24
-rw-r--r--kernel/pid.c40
-rw-r--r--kernel/posix-timers.c68
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/disk.c20
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/pm.c21
-rw-r--r--kernel/power/power.h75
-rw-r--r--kernel/power/process.c61
-rw-r--r--kernel/power/smp.c4
-rw-r--r--kernel/power/snapshot.c335
-rw-r--r--kernel/power/swap.c545
-rw-r--r--kernel/power/swsusp.c887
-rw-r--r--kernel/power/user.c333
-rw-r--r--kernel/printk.c76
-rw-r--r--kernel/profile.c64
-rw-r--r--kernel/ptrace.c8
-rw-r--r--kernel/rcupdate.c25
-rw-r--r--kernel/rcutorture.c37
-rw-r--r--kernel/relay.c1012
-rw-r--r--kernel/sched.c180
-rw-r--r--kernel/signal.c355
-rw-r--r--kernel/softirq.c20
-rw-r--r--kernel/softlockup.c57
-rw-r--r--kernel/spinlock.c9
-rw-r--r--kernel/sys.c514
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c19
-rw-r--r--kernel/time.c63
-rw-r--r--kernel/timer.c74
-rw-r--r--kernel/user.c10
-rw-r--r--kernel/workqueue.c29
57 files changed, 5940 insertions, 2875 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde815d..58908f9d156a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_FUTEX) += futex.o
+ifeq ($(CONFIG_COMPAT),y)
+obj-$(CONFIG_FUTEX) += futex_compat.o
+endif
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
@@ -26,7 +29,7 @@ obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
-obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
@@ -34,6 +37,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RELAY) += relay.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a813d2883e5..04fe2e301b61 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -52,6 +52,7 @@
#include <linux/audit.h>
#include <net/sock.h>
+#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
@@ -72,7 +73,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
* contains the (non-zero) pid. */
int audit_pid;
-/* If audit_limit is non-zero, limit the rate of sending audit records
+/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
* audit records being dropped. */
static int audit_rate_limit;
@@ -102,7 +103,7 @@ static struct sock *audit_sock;
* than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
* being placed on the freelist). */
static DEFINE_SPINLOCK(audit_freelist_lock);
-static int audit_freelist_count = 0;
+static int audit_freelist_count;
static LIST_HEAD(audit_freelist);
static struct sk_buff_head audit_skb_queue;
@@ -113,7 +114,7 @@ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
/* The netlink socket is only to be read by 1 CPU, which lets us assume
* that list additions and deletions never happen simultaneously in
* auditsc.c */
-DECLARE_MUTEX(audit_netlink_sem);
+DEFINE_MUTEX(audit_netlink_mutex);
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -142,7 +143,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
nlh->nlmsg_pid = pid;
}
-static void audit_panic(const char *message)
+void audit_panic(const char *message)
{
switch (audit_failure)
{
@@ -186,8 +187,14 @@ static inline int audit_rate_check(void)
return retval;
}
-/* Emit at least 1 message per second, even if audit_rate_check is
- * throttling. */
+/**
+ * audit_log_lost - conditionally log lost audit message event
+ * @message: the message stating reason for lost audit message
+ *
+ * Emit at least 1 message per second, even if audit_rate_check is
+ * throttling.
+ * Always increment the lost messages counter.
+*/
void audit_log_lost(const char *message)
{
static unsigned long last_msg = 0;
@@ -218,7 +225,6 @@ void audit_log_lost(const char *message)
audit_backlog_limit);
audit_panic(message);
}
-
}
static int audit_set_rate_limit(int limit, uid_t loginuid)
@@ -300,8 +306,22 @@ static int kauditd_thread(void *dummy)
remove_wait_queue(&kauditd_wait, &wait);
}
}
+ return 0;
}
+/**
+ * audit_send_reply - send an audit reply message via netlink
+ * @pid: process id to send reply to
+ * @seq: sequence number
+ * @type: audit message type
+ * @done: done (last) flag
+ * @multi: multi-part message flag
+ * @payload: payload data
+ * @size: payload size
+ *
+ * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * No failure notifications.
+ */
void audit_send_reply(int pid, int seq, int type, int done, int multi,
void *payload, int size)
{
@@ -342,15 +362,19 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
switch (msg_type) {
case AUDIT_GET:
case AUDIT_LIST:
+ case AUDIT_LIST_RULES:
case AUDIT_SET:
case AUDIT_ADD:
+ case AUDIT_ADD_RULE:
case AUDIT_DEL:
+ case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
err = -EPERM;
break;
@@ -376,7 +400,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
- /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */
+ /* As soon as there's any sign of userspace auditd,
+ * start kauditd to talk to it */
if (!kauditd_task)
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
@@ -430,6 +455,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
@@ -448,12 +474,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
case AUDIT_ADD:
case AUDIT_DEL:
- if (nlh->nlmsg_len < sizeof(struct audit_rule))
+ if (nlmsg_len(nlh) < sizeof(struct audit_rule))
return -EINVAL;
/* fallthrough */
case AUDIT_LIST:
err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
- uid, seq, data, loginuid);
+ uid, seq, data, nlmsg_len(nlh),
+ loginuid);
+ break;
+ case AUDIT_ADD_RULE:
+ case AUDIT_DEL_RULE:
+ if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+ return -EINVAL;
+ /* fallthrough */
+ case AUDIT_LIST_RULES:
+ err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+ uid, seq, data, nlmsg_len(nlh),
+ loginuid);
break;
case AUDIT_SIGNAL_INFO:
sig_data.uid = audit_sig_uid;
@@ -469,9 +506,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err < 0 ? err : 0;
}
-/* Get message from skb (based on rtnetlink_rcv_skb). Each message is
+/*
+ * Get message from skb (based on rtnetlink_rcv_skb). Each message is
* processed by audit_receive_msg. Malformed skbs with wrong length are
- * discarded silently. */
+ * discarded silently.
+ */
static void audit_receive_skb(struct sk_buff *skb)
{
int err;
@@ -499,14 +538,14 @@ static void audit_receive(struct sock *sk, int length)
struct sk_buff *skb;
unsigned int qlen;
- down(&audit_netlink_sem);
+ mutex_lock(&audit_netlink_mutex);
for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
skb = skb_dequeue(&sk->sk_receive_queue);
audit_receive_skb(skb);
kfree_skb(skb);
}
- up(&audit_netlink_sem);
+ mutex_unlock(&audit_netlink_mutex);
}
@@ -519,8 +558,9 @@ static int __init audit_init(void)
THIS_MODULE);
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
+ else
+ audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
- audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
skb_queue_head_init(&audit_skb_queue);
audit_initialized = 1;
audit_enabled = audit_default;
@@ -600,7 +640,10 @@ err:
return NULL;
}
-/* Compute a serial number for the audit record. Audit records are
+/**
+ * audit_serial - compute a serial number for the audit record
+ *
+ * Compute a serial number for the audit record. Audit records are
* written to user-space as soon as they are generated, so a complete
* audit record may be written in several pieces. The timestamp of the
* record and this serial number are used by the user-space tools to
@@ -612,8 +655,8 @@ err:
* audit context (for those records that have a context), and emit them
* all at syscall exit. However, this could delay the reporting of
* significant errors until syscall exit (or never, if the system
- * halts). */
-
+ * halts).
+ */
unsigned int audit_serial(void)
{
static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
@@ -649,6 +692,21 @@ static inline void audit_get_stamp(struct audit_context *ctx,
* will be written at syscall exit. If there is no associated task, tsk
* should be NULL. */
+/**
+ * audit_log_start - obtain an audit buffer
+ * @ctx: audit_context (may be NULL)
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ *
+ * Returns audit_buffer pointer on success or NULL on error.
+ *
+ * Obtain an audit buffer. This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the task (ctx) is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, then
+ * task context (ctx) should be NULL.
+ */
struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
@@ -661,6 +719,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (!audit_initialized)
return NULL;
+ if (unlikely(audit_filter_type(type)))
+ return NULL;
+
if (gfp_mask & __GFP_WAIT)
reserve = 0;
else
@@ -713,6 +774,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
/**
* audit_expand - expand skb in the audit buffer
* @ab: audit_buffer
+ * @extra: space to add at tail of the skb
*
* Returns 0 (no space) on failed expansion, or available space if
* successful.
@@ -729,10 +791,12 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
return skb_tailroom(skb);
}
-/* Format an audit message into the audit buffer. If there isn't enough
+/*
+ * Format an audit message into the audit buffer. If there isn't enough
* room in the audit buffer, more room will be allocated and vsnprint
* will be called a second time. Currently, we assume that a printk
- * can't format message larger than 1024 bytes, so we don't either. */
+ * can't format message larger than 1024 bytes, so we don't either.
+ */
static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
va_list args)
{
@@ -757,7 +821,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
/* The printk buffer is 1024 bytes long, so if we get
* here and AUDIT_BUFSIZ is at least 1024, then we can
* log everything that printk could have logged. */
- avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+ avail = audit_expand(ab,
+ max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
goto out;
len = vsnprintf(skb->tail, avail, fmt, args2);
@@ -768,8 +833,14 @@ out:
return;
}
-/* Format a message into the audit buffer. All the work is done in
- * audit_log_vformat. */
+/**
+ * audit_log_format - format a message into the audit buffer.
+ * @ab: audit_buffer
+ * @fmt: format string
+ * @...: optional parameters matching @fmt string
+ *
+ * All the work is done in audit_log_vformat.
+ */
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{
va_list args;
@@ -781,9 +852,18 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
va_end(args);
}
-/* This function will take the passed buf and convert it into a string of
- * ascii hex digits. The new string is placed onto the skb. */
-void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
+/**
+ * audit_log_hex - convert a buffer to hex and append it to the audit skb
+ * @ab: the audit_buffer
+ * @buf: buffer to convert to hex
+ * @len: length of @buf to be converted
+ *
+ * No return value; failure to expand is silently ignored.
+ *
+ * This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb.
+ */
+void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
size_t len)
{
int i, avail, new_len;
@@ -812,10 +892,16 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
skb_put(skb, len << 1); /* new string is twice the old string */
}
-/* This code will escape a string that is passed to it if the string
- * contains a control character, unprintable character, double quote mark,
+/**
+ * audit_log_unstrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark,
* or a space. Unescaped strings will start and end with a double quote mark.
- * Strings that are escaped are printed in hex (2 digits per char). */
+ * Strings that are escaped are printed in hex (2 digits per char).
+ */
void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
{
const unsigned char *p = string;
@@ -854,10 +940,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
kfree(path);
}
-/* The netlink_* functions cannot be called inside an irq context, so
- * the audit buffer is places on a queue and a tasklet is scheduled to
+/**
+ * audit_log_end - end one audit record
+ * @ab: the audit_buffer
+ *
+ * The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is placed on a queue and a tasklet is scheduled to
* remove them from the queue outside the irq context. May be called in
- * any context. */
+ * any context.
+ */
void audit_log_end(struct audit_buffer *ab)
{
if (!ab)
@@ -878,9 +969,18 @@ void audit_log_end(struct audit_buffer *ab)
audit_buffer_free(ab);
}
-/* Log an audit record. This is a convenience function that calls
- * audit_log_start, audit_log_vformat, and audit_log_end. It may be
- * called in any context. */
+/**
+ * audit_log - Log an audit record
+ * @ctx: audit context
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ * @fmt: format string to use
+ * @...: variable parameters matching the format string
+ *
+ * This is a convenience function that calls audit_log_start,
+ * audit_log_vformat, and audit_log_end. It may be called
+ * in any context.
+ */
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
const char *fmt, ...)
{
@@ -895,3 +995,8 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
audit_log_end(ab);
}
}
+
+EXPORT_SYMBOL(audit_log_start);
+EXPORT_SYMBOL(audit_log_end);
+EXPORT_SYMBOL(audit_log_format);
+EXPORT_SYMBOL(audit_log);
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 000000000000..bc5392076e2b
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,88 @@
+/* audit -- definition of audit_context structure and supporting types
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/audit.h>
+
+/* 0 = no checking
+ 1 = put_count checking
+ 2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* At task start time, the audit_state is set in the audit_context using
+ a per-task filter. At syscall entry, the audit_state is augmented by
+ the syscall filter. */
+enum audit_state {
+ AUDIT_DISABLED, /* Do not create per-task audit_context.
+ * No syscall-specific audit records can
+ * be generated. */
+ AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
+ * but don't necessarily fill it in at
+ * syscall entry time (i.e., filter
+ * instead). */
+ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ * and always fill it in at syscall
+ * entry time. This makes a full
+ * syscall record available if some
+ * other part of the kernel decides it
+ * should be recorded. */
+ AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ * always fill it in at syscall entry
+ * time, and always write out the audit
+ * record at syscall exit time. */
+};
+
+/* Rule lists */
+struct audit_field {
+ u32 type;
+ u32 val;
+ u32 op;
+};
+
+struct audit_krule {
+ int vers_ops;
+ u32 flags;
+ u32 listnr;
+ u32 action;
+ u32 mask[AUDIT_BITMASK_SIZE];
+ u32 buflen; /* for data alloc on list rules */
+ u32 field_count;
+ struct audit_field *fields;
+};
+
+struct audit_entry {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct audit_krule rule;
+};
+
+
+extern int audit_pid;
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+
+extern void audit_send_reply(int pid, int seq, int type,
+ int done, int multi,
+ void *payload, int size);
+extern void audit_log_lost(const char *message);
+extern void audit_panic(const char *message);
+extern struct mutex audit_netlink_mutex;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 000000000000..d3a8539f3a83
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,630 @@
+/* auditfilter.c -- filtering of audit events
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/netlink.h>
+#include "audit.h"
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+ LIST_HEAD_INIT(audit_filter_list[0]),
+ LIST_HEAD_INIT(audit_filter_list[1]),
+ LIST_HEAD_INIT(audit_filter_list[2]),
+ LIST_HEAD_INIT(audit_filter_list[3]),
+ LIST_HEAD_INIT(audit_filter_list[4]),
+ LIST_HEAD_INIT(audit_filter_list[5]),
+#if AUDIT_NR_FILTERS != 6
+#error Fix audit_filter_list initialiser
+#endif
+};
+
+static inline void audit_free_rule(struct audit_entry *e)
+{
+ kfree(e->rule.fields);
+ kfree(e);
+}
+
+static inline void audit_free_rule_rcu(struct rcu_head *head)
+{
+ struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+ audit_free_rule(e);
+}
+
+/* Unpack a filter field's string representation from user-space
+ * buffer. */
+static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
+{
+ char *str;
+
+ if (!*bufp || (len == 0) || (len > *remain))
+ return ERR_PTR(-EINVAL);
+
+ /* Of the currently implemented string fields, PATH_MAX
+ * defines the longest valid length.
+ */
+ if (len > PATH_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ str = kmalloc(len + 1, GFP_KERNEL);
+ if (unlikely(!str))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(str, *bufp, len);
+ str[len] = 0;
+ *bufp += len;
+ *remain -= len;
+
+ return str;
+}
+
+/* Common user-space to kernel rule translation. */
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+{
+ unsigned listnr;
+ struct audit_entry *entry;
+ struct audit_field *fields;
+ int i, err;
+
+ err = -EINVAL;
+ listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
+ switch(listnr) {
+ default:
+ goto exit_err;
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+#ifdef CONFIG_AUDITSYSCALL
+ case AUDIT_FILTER_ENTRY:
+ case AUDIT_FILTER_EXIT:
+ case AUDIT_FILTER_TASK:
+#endif
+ ;
+ }
+ if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
+ rule->action != AUDIT_ALWAYS)
+ goto exit_err;
+ if (rule->field_count > AUDIT_MAX_FIELDS)
+ goto exit_err;
+
+ err = -ENOMEM;
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (unlikely(!entry))
+ goto exit_err;
+ fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL);
+ if (unlikely(!fields)) {
+ kfree(entry);
+ goto exit_err;
+ }
+
+ memset(&entry->rule, 0, sizeof(struct audit_krule));
+ memset(fields, 0, sizeof(struct audit_field));
+
+ entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
+ entry->rule.listnr = listnr;
+ entry->rule.action = rule->action;
+ entry->rule.field_count = rule->field_count;
+ entry->rule.fields = fields;
+
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ entry->rule.mask[i] = rule->mask[i];
+
+ return entry;
+
+exit_err:
+ return ERR_PTR(err);
+}
+
+/* Translate struct audit_rule to kernel's rule respresentation.
+ * Exists for backward compatibility with userspace. */
+static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+{
+ struct audit_entry *entry;
+ int err = 0;
+ int i;
+
+ entry = audit_to_entry_common(rule);
+ if (IS_ERR(entry))
+ goto exit_nofree;
+
+ for (i = 0; i < rule->field_count; i++) {
+ struct audit_field *f = &entry->rule.fields[i];
+
+ if (rule->fields[i] & AUDIT_UNUSED_BITS) {
+ err = -EINVAL;
+ goto exit_free;
+ }
+
+ f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
+ f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
+ f->val = rule->values[i];
+
+ entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
+
+ /* Support for legacy operators where
+ * AUDIT_NEGATE bit signifies != and otherwise assumes == */
+ if (f->op & AUDIT_NEGATE)
+ f->op = AUDIT_NOT_EQUAL;
+ else if (!f->op)
+ f->op = AUDIT_EQUAL;
+ else if (f->op == AUDIT_OPERATORS) {
+ err = -EINVAL;
+ goto exit_free;
+ }
+ }
+
+exit_nofree:
+ return entry;
+
+exit_free:
+ audit_free_rule(entry);
+ return ERR_PTR(err);
+}
+
+/* Translate struct audit_rule_data to kernel's rule respresentation. */
+static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
+ size_t datasz)
+{
+ int err = 0;
+ struct audit_entry *entry;
+ void *bufp;
+ /* size_t remain = datasz - sizeof(struct audit_rule_data); */
+ int i;
+
+ entry = audit_to_entry_common((struct audit_rule *)data);
+ if (IS_ERR(entry))
+ goto exit_nofree;
+
+ bufp = data->buf;
+ entry->rule.vers_ops = 2;
+ for (i = 0; i < data->field_count; i++) {
+ struct audit_field *f = &entry->rule.fields[i];
+
+ err = -EINVAL;
+ if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
+ data->fieldflags[i] & ~AUDIT_OPERATORS)
+ goto exit_free;
+
+ f->op = data->fieldflags[i] & AUDIT_OPERATORS;
+ f->type = data->fields[i];
+ switch(f->type) {
+ /* call type-specific conversion routines here */
+ default:
+ f->val = data->values[i];
+ }
+ }
+
+exit_nofree:
+ return entry;
+
+exit_free:
+ audit_free_rule(entry);
+ return ERR_PTR(err);
+}
+
+/* Pack a filter field's string representation into data block. */
+static inline size_t audit_pack_string(void **bufp, char *str)
+{
+ size_t len = strlen(str);
+
+ memcpy(*bufp, str, len);
+ *bufp += len;
+
+ return len;
+}
+
+/* Translate kernel rule respresentation to struct audit_rule.
+ * Exists for backward compatibility with userspace. */
+static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
+{
+ struct audit_rule *rule;
+ int i;
+
+ rule = kmalloc(sizeof(*rule), GFP_KERNEL);
+ if (unlikely(!rule))
+ return ERR_PTR(-ENOMEM);
+ memset(rule, 0, sizeof(*rule));
+
+ rule->flags = krule->flags | krule->listnr;
+ rule->action = krule->action;
+ rule->field_count = krule->field_count;
+ for (i = 0; i < rule->field_count; i++) {
+ rule->values[i] = krule->fields[i].val;
+ rule->fields[i] = krule->fields[i].type;
+
+ if (krule->vers_ops == 1) {
+ if (krule->fields[i].op & AUDIT_NOT_EQUAL)
+ rule->fields[i] |= AUDIT_NEGATE;
+ } else {
+ rule->fields[i] |= krule->fields[i].op;
+ }
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
+
+ return rule;
+}
+
+/* Translate kernel rule respresentation to struct audit_rule_data. */
+static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
+{
+ struct audit_rule_data *data;
+ void *bufp;
+ int i;
+
+ data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ if (unlikely(!data))
+ return ERR_PTR(-ENOMEM);
+ memset(data, 0, sizeof(*data));
+
+ data->flags = krule->flags | krule->listnr;
+ data->action = krule->action;
+ data->field_count = krule->field_count;
+ bufp = data->buf;
+ for (i = 0; i < data->field_count; i++) {
+ struct audit_field *f = &krule->fields[i];
+
+ data->fields[i] = f->type;
+ data->fieldflags[i] = f->op;
+ switch(f->type) {
+ /* call type-specific conversion routines here */
+ default:
+ data->values[i] = f->val;
+ }
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+
+ return data;
+}
+
+/* Compare two rules in kernel format. Considered success if rules
+ * don't match. */
+static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
+{
+ int i;
+
+ if (a->flags != b->flags ||
+ a->listnr != b->listnr ||
+ a->action != b->action ||
+ a->field_count != b->field_count)
+ return 1;
+
+ for (i = 0; i < a->field_count; i++) {
+ if (a->fields[i].type != b->fields[i].type ||
+ a->fields[i].op != b->fields[i].op)
+ return 1;
+
+ switch(a->fields[i].type) {
+ /* call type-specific comparison routines here */
+ default:
+ if (a->fields[i].val != b->fields[i].val)
+ return 1;
+ }
+ }
+
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ if (a->mask[i] != b->mask[i])
+ return 1;
+
+ return 0;
+}
+
+/* Add rule to given filterlist if not a duplicate. Protected by
+ * audit_netlink_mutex. */
+static inline int audit_add_rule(struct audit_entry *entry,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+
+ /* Do not use the _rcu iterator here, since this is the only
+ * addition routine. */
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(&entry->rule, &e->rule))
+ return -EEXIST;
+ }
+
+ if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+ list_add_rcu(&entry->list, list);
+ } else {
+ list_add_tail_rcu(&entry->list, list);
+ }
+
+ return 0;
+}
+
+/* Remove an existing rule from filterlist. Protected by
+ * audit_netlink_mutex. */
+static inline int audit_del_rule(struct audit_entry *entry,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+
+ /* Do not use the _rcu iterator here, since this is the only
+ * deletion routine. */
+ list_for_each_entry(e, list, list) {
+ if (!audit_compare_rule(&entry->rule, &e->rule)) {
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+ return 0;
+ }
+ }
+ return -ENOENT; /* No matching rule */
+}
+
+/* List rules using struct audit_rule. Exists for backward
+ * compatibility with userspace. */
+static int audit_list(void *_dest)
+{
+ int pid, seq;
+ int *dest = _dest;
+ struct audit_entry *entry;
+ int i;
+
+ pid = dest[0];
+ seq = dest[1];
+ kfree(dest);
+
+ mutex_lock(&audit_netlink_mutex);
+
+ /* The *_rcu iterators not needed here because we are
+ always called with audit_netlink_mutex held. */
+ for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry(entry, &audit_filter_list[i], list) {
+ struct audit_rule *rule;
+
+ rule = audit_krule_to_rule(&entry->rule);
+ if (unlikely(!rule))
+ break;
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ rule, sizeof(*rule));
+ kfree(rule);
+ }
+ }
+ audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+
+ mutex_unlock(&audit_netlink_mutex);
+ return 0;
+}
+
+/* List rules using struct audit_rule_data. */
+static int audit_list_rules(void *_dest)
+{
+ int pid, seq;
+ int *dest = _dest;
+ struct audit_entry *e;
+ int i;
+
+ pid = dest[0];
+ seq = dest[1];
+ kfree(dest);
+
+ mutex_lock(&audit_netlink_mutex);
+
+ /* The *_rcu iterators not needed here because we are
+ always called with audit_netlink_mutex held. */
+ for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry(e, &audit_filter_list[i], list) {
+ struct audit_rule_data *data;
+
+ data = audit_krule_to_data(&e->rule);
+ if (unlikely(!data))
+ break;
+ audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
+ data, sizeof(*data));
+ kfree(data);
+ }
+ }
+ audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+
+ mutex_unlock(&audit_netlink_mutex);
+ return 0;
+}
+
+/**
+ * audit_receive_filter - apply all rules to the specified message type
+ * @type: audit message type
+ * @pid: target pid for netlink audit messages
+ * @uid: target uid for netlink audit messages
+ * @seq: netlink audit message sequence (serial) number
+ * @data: payload data
+ * @datasz: size of payload data
+ * @loginuid: loginuid of sender
+ */
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
+ size_t datasz, uid_t loginuid)
+{
+ struct task_struct *tsk;
+ int *dest;
+ int err = 0;
+ struct audit_entry *entry;
+
+ switch (type) {
+ case AUDIT_LIST:
+ case AUDIT_LIST_RULES:
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest[0] = pid;
+ dest[1] = seq;
+
+ if (type == AUDIT_LIST)
+ tsk = kthread_run(audit_list, dest, "audit_list");
+ else
+ tsk = kthread_run(audit_list_rules, dest,
+ "audit_list_rules");
+ if (IS_ERR(tsk)) {
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
+ break;
+ case AUDIT_ADD:
+ case AUDIT_ADD_RULE:
+ if (type == AUDIT_ADD)
+ entry = audit_rule_to_entry(data);
+ else
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
+ err = audit_add_rule(entry,
+ &audit_filter_list[entry->rule.listnr]);
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+ "auid=%u add rule to list=%d res=%d\n",
+ loginuid, entry->rule.listnr, !err);
+
+ if (err)
+ audit_free_rule(entry);
+ break;
+ case AUDIT_DEL:
+ case AUDIT_DEL_RULE:
+ if (type == AUDIT_DEL)
+ entry = audit_rule_to_entry(data);
+ else
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
+ err = audit_del_rule(entry,
+ &audit_filter_list[entry->rule.listnr]);
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+ "auid=%u remove rule from list=%d res=%d\n",
+ loginuid, entry->rule.listnr, !err);
+
+ audit_free_rule(entry);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return err;
+}
+
+int audit_comparator(const u32 left, const u32 op, const u32 right)
+{
+ switch (op) {
+ case AUDIT_EQUAL:
+ return (left == right);
+ case AUDIT_NOT_EQUAL:
+ return (left != right);
+ case AUDIT_LESS_THAN:
+ return (left < right);
+ case AUDIT_LESS_THAN_OR_EQUAL:
+ return (left <= right);
+ case AUDIT_GREATER_THAN:
+ return (left > right);
+ case AUDIT_GREATER_THAN_OR_EQUAL:
+ return (left >= right);
+ }
+ BUG();
+ return 0;
+}
+
+
+
+static int audit_filter_user_rules(struct netlink_skb_parms *cb,
+ struct audit_krule *rule,
+ enum audit_state *state)
+{
+ int i;
+
+ for (i = 0; i < rule->field_count; i++) {
+ struct audit_field *f = &rule->fields[i];
+ int result = 0;
+
+ switch (f->type) {
+ case AUDIT_PID:
+ result = audit_comparator(cb->creds.pid, f->op, f->val);
+ break;
+ case AUDIT_UID:
+ result = audit_comparator(cb->creds.uid, f->op, f->val);
+ break;
+ case AUDIT_GID:
+ result = audit_comparator(cb->creds.gid, f->op, f->val);
+ break;
+ case AUDIT_LOGINUID:
+ result = audit_comparator(cb->loginuid, f->op, f->val);
+ break;
+ }
+
+ if (!result)
+ return 0;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+int audit_filter_user(struct netlink_skb_parms *cb, int type)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+ int ret = 1;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+ if (audit_filter_user_rules(cb, &e->rule, &state)) {
+ if (state == AUDIT_DISABLED)
+ ret = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret; /* Audit by default */
+}
+
+int audit_filter_type(int type)
+{
+ struct audit_entry *e;
+ int result = 0;
+
+ rcu_read_lock();
+ if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+ goto unlock_and_return;
+
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
+ list) {
+ int i;
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+ if (f->type == AUDIT_MSGTYPE) {
+ result = audit_comparator(type, f->op, f->val);
+ if (!result)
+ break;
+ }
+ }
+ if (result)
+ goto unlock_and_return;
+ }
+unlock_and_return:
+ rcu_read_unlock();
+ return result;
+}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d7e7e637b92a..7f160df21a23 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2,6 +2,8 @@
* Handles all system-call specific auditing features.
*
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright (C) 2005 IBM Corporation
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
@@ -27,11 +29,22 @@
* this file -- see entry.S) is based on a GPL'd patch written by
* okir@suse.de and Copyright 2003 SuSE Linux AG.
*
+ * The support of additional filter rules compares (>, <, >=, <=) was
+ * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
+ *
+ * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
+ * filesystem information.
+ *
+ * Subject and object context labeling support added by <danjones@us.ibm.com>
+ * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
*/
#include <linux/init.h>
#include <asm/types.h>
#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mount.h>
@@ -39,16 +52,16 @@
#include <linux/audit.h>
#include <linux/personality.h>
#include <linux/time.h>
-#include <linux/kthread.h>
#include <linux/netlink.h>
#include <linux/compiler.h>
#include <asm/unistd.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/tty.h>
+
+#include "audit.h"
-/* 0 = no checking
- 1 = put_count checking
- 2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
+extern struct list_head audit_filter_list[];
/* No syscall auditing will take place unless audit_enabled != 0. */
extern int audit_enabled;
@@ -62,29 +75,6 @@ extern int audit_enabled;
* path_lookup. */
#define AUDIT_NAMES_RESERVED 7
-/* At task start time, the audit_state is set in the audit_context using
- a per-task filter. At syscall entry, the audit_state is augmented by
- the syscall filter. */
-enum audit_state {
- AUDIT_DISABLED, /* Do not create per-task audit_context.
- * No syscall-specific audit records can
- * be generated. */
- AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
- * but don't necessarily fill it in at
- * syscall entry time (i.e., filter
- * instead). */
- AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
- * and always fill it in at syscall
- * entry time. This makes a full
- * syscall record available if some
- * other part of the kernel decides it
- * should be recorded. */
- AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
- * always fill it in at syscall entry
- * time, and always write out the audit
- * record at syscall exit time. */
-};
-
/* When fs/namei.c:getname() is called, we store the pointer in name and
* we don't let putname() free it (instead we free all of the saved
* pointers at syscall exit time).
@@ -93,12 +83,13 @@ enum audit_state {
struct audit_names {
const char *name;
unsigned long ino;
+ unsigned long pino;
dev_t dev;
umode_t mode;
uid_t uid;
gid_t gid;
dev_t rdev;
- unsigned flags;
+ char *ctx;
};
struct audit_aux_data {
@@ -115,6 +106,7 @@ struct audit_aux_data_ipcctl {
uid_t uid;
gid_t gid;
mode_t mode;
+ char *ctx;
};
struct audit_aux_data_socketcall {
@@ -167,290 +159,72 @@ struct audit_context {
#endif
};
- /* Public API */
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
-static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
- LIST_HEAD_INIT(audit_filter_list[0]),
- LIST_HEAD_INIT(audit_filter_list[1]),
- LIST_HEAD_INIT(audit_filter_list[2]),
- LIST_HEAD_INIT(audit_filter_list[3]),
- LIST_HEAD_INIT(audit_filter_list[4]),
-#if AUDIT_NR_FILTERS != 5
-#error Fix audit_filter_list initialiser
-#endif
-};
-
-struct audit_entry {
- struct list_head list;
- struct rcu_head rcu;
- struct audit_rule rule;
-};
-
-extern int audit_pid;
-
-/* Copy rule from user-space to kernel-space. Called from
- * audit_add_rule during AUDIT_ADD. */
-static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
-{
- int i;
-
- if (s->action != AUDIT_NEVER
- && s->action != AUDIT_POSSIBLE
- && s->action != AUDIT_ALWAYS)
- return -1;
- if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
- return -1;
- if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
- return -1;
-
- d->flags = s->flags;
- d->action = s->action;
- d->field_count = s->field_count;
- for (i = 0; i < d->field_count; i++) {
- d->fields[i] = s->fields[i];
- d->values[i] = s->values[i];
- }
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
- return 0;
-}
-
-/* Check to see if two rules are identical. It is called from
- * audit_add_rule during AUDIT_ADD and
- * audit_del_rule during AUDIT_DEL. */
-static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
-{
- int i;
-
- if (a->flags != b->flags)
- return 1;
-
- if (a->action != b->action)
- return 1;
-
- if (a->field_count != b->field_count)
- return 1;
-
- for (i = 0; i < a->field_count; i++) {
- if (a->fields[i] != b->fields[i]
- || a->values[i] != b->values[i])
- return 1;
- }
-
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
- if (a->mask[i] != b->mask[i])
- return 1;
-
- return 0;
-}
-
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
- * audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_rule *rule,
- struct list_head *list)
-{
- struct audit_entry *entry;
-
- /* Do not use the _rcu iterator here, since this is the only
- * addition routine. */
- list_for_each_entry(entry, list, list) {
- if (!audit_compare_rule(rule, &entry->rule)) {
- return -EEXIST;
- }
- }
-
- if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
- return -ENOMEM;
- if (audit_copy_rule(&entry->rule, rule)) {
- kfree(entry);
- return -EINVAL;
- }
-
- if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
- entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
- list_add_rcu(&entry->list, list);
- } else {
- list_add_tail_rcu(&entry->list, list);
- }
-
- return 0;
-}
-
-static inline void audit_free_rule(struct rcu_head *head)
-{
- struct audit_entry *e = container_of(head, struct audit_entry, rcu);
- kfree(e);
-}
-
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
- * audit_netlink_sem. */
-static inline int audit_del_rule(struct audit_rule *rule,
- struct list_head *list)
-{
- struct audit_entry *e;
-
- /* Do not use the _rcu iterator here, since this is the only
- * deletion routine. */
- list_for_each_entry(e, list, list) {
- if (!audit_compare_rule(rule, &e->rule)) {
- list_del_rcu(&e->list);
- call_rcu(&e->rcu, audit_free_rule);
- return 0;
- }
- }
- return -ENOENT; /* No matching rule */
-}
-
-static int audit_list_rules(void *_dest)
-{
- int pid, seq;
- int *dest = _dest;
- struct audit_entry *entry;
- int i;
-
- pid = dest[0];
- seq = dest[1];
- kfree(dest);
-
- down(&audit_netlink_sem);
-
- /* The *_rcu iterators not needed here because we are
- always called with audit_netlink_sem held. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
- list_for_each_entry(entry, &audit_filter_list[i], list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- }
- audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
-
- up(&audit_netlink_sem);
- return 0;
-}
-
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
- uid_t loginuid)
-{
- struct task_struct *tsk;
- int *dest;
- int err = 0;
- unsigned listnr;
-
- switch (type) {
- case AUDIT_LIST:
- /* We can't just spew out the rules here because we might fill
- * the available socket buffer space and deadlock waiting for
- * auditctl to read from it... which isn't ever going to
- * happen if we're actually running in the context of auditctl
- * trying to _send_ the stuff */
-
- dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
- if (!dest)
- return -ENOMEM;
- dest[0] = pid;
- dest[1] = seq;
-
- tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
- if (IS_ERR(tsk)) {
- kfree(dest);
- err = PTR_ERR(tsk);
- }
- break;
- case AUDIT_ADD:
- listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
- if (listnr >= AUDIT_NR_FILTERS)
- return -EINVAL;
-
- err = audit_add_rule(data, &audit_filter_list[listnr]);
- if (!err)
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "auid=%u added an audit rule\n", loginuid);
- break;
- case AUDIT_DEL:
- listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
- if (listnr >= AUDIT_NR_FILTERS)
- return -EINVAL;
-
- err = audit_del_rule(data, &audit_filter_list[listnr]);
- if (!err)
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "auid=%u removed an audit rule\n", loginuid);
- break;
- default:
- return -EINVAL;
- }
-
- return err;
-}
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise. */
static int audit_filter_rules(struct task_struct *tsk,
- struct audit_rule *rule,
+ struct audit_krule *rule,
struct audit_context *ctx,
enum audit_state *state)
{
int i, j;
for (i = 0; i < rule->field_count; i++) {
- u32 field = rule->fields[i] & ~AUDIT_NEGATE;
- u32 value = rule->values[i];
+ struct audit_field *f = &rule->fields[i];
int result = 0;
- switch (field) {
+ switch (f->type) {
case AUDIT_PID:
- result = (tsk->pid == value);
+ result = audit_comparator(tsk->pid, f->op, f->val);
break;
case AUDIT_UID:
- result = (tsk->uid == value);
+ result = audit_comparator(tsk->uid, f->op, f->val);
break;
case AUDIT_EUID:
- result = (tsk->euid == value);
+ result = audit_comparator(tsk->euid, f->op, f->val);
break;
case AUDIT_SUID:
- result = (tsk->suid == value);
+ result = audit_comparator(tsk->suid, f->op, f->val);
break;
case AUDIT_FSUID:
- result = (tsk->fsuid == value);
+ result = audit_comparator(tsk->fsuid, f->op, f->val);
break;
case AUDIT_GID:
- result = (tsk->gid == value);
+ result = audit_comparator(tsk->gid, f->op, f->val);
break;
case AUDIT_EGID:
- result = (tsk->egid == value);
+ result = audit_comparator(tsk->egid, f->op, f->val);
break;
case AUDIT_SGID:
- result = (tsk->sgid == value);
+ result = audit_comparator(tsk->sgid, f->op, f->val);
break;
case AUDIT_FSGID:
- result = (tsk->fsgid == value);
+ result = audit_comparator(tsk->fsgid, f->op, f->val);
break;
case AUDIT_PERS:
- result = (tsk->personality == value);
+ result = audit_comparator(tsk->personality, f->op, f->val);
break;
case AUDIT_ARCH:
- if (ctx)
- result = (ctx->arch == value);
+ if (ctx)
+ result = audit_comparator(ctx->arch, f->op, f->val);
break;
case AUDIT_EXIT:
if (ctx && ctx->return_valid)
- result = (ctx->return_code == value);
+ result = audit_comparator(ctx->return_code, f->op, f->val);
break;
case AUDIT_SUCCESS:
if (ctx && ctx->return_valid) {
- if (value)
- result = (ctx->return_valid == AUDITSC_SUCCESS);
+ if (f->val)
+ result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
else
- result = (ctx->return_valid == AUDITSC_FAILURE);
+ result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
}
break;
case AUDIT_DEVMAJOR:
if (ctx) {
for (j = 0; j < ctx->name_count; j++) {
- if (MAJOR(ctx->names[j].dev)==value) {
+ if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
++result;
break;
}
@@ -460,7 +234,7 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_DEVMINOR:
if (ctx) {
for (j = 0; j < ctx->name_count; j++) {
- if (MINOR(ctx->names[j].dev)==value) {
+ if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
++result;
break;
}
@@ -470,7 +244,8 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_INODE:
if (ctx) {
for (j = 0; j < ctx->name_count; j++) {
- if (ctx->names[j].ino == value) {
+ if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
+ audit_comparator(ctx->names[j].pino, f->op, f->val)) {
++result;
break;
}
@@ -480,19 +255,17 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID:
result = 0;
if (ctx)
- result = (ctx->loginuid == value);
+ result = audit_comparator(ctx->loginuid, f->op, f->val);
break;
case AUDIT_ARG0:
case AUDIT_ARG1:
case AUDIT_ARG2:
case AUDIT_ARG3:
if (ctx)
- result = (ctx->argv[field-AUDIT_ARG0]==value);
+ result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
break;
}
- if (rule->fields[i] & AUDIT_NEGATE)
- result = !result;
if (!result)
return 0;
}
@@ -527,7 +300,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
* also not high enough that we already know we have to write an audit
- * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
*/
static enum audit_state audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx,
@@ -541,77 +314,19 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
rcu_read_lock();
if (!list_empty(list)) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
-
- list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit
- && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
- rcu_read_unlock();
- return state;
- }
- }
- }
- rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
-}
-
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
- struct audit_rule *rule,
- enum audit_state *state)
-{
- int i;
-
- for (i = 0; i < rule->field_count; i++) {
- u32 field = rule->fields[i] & ~AUDIT_NEGATE;
- u32 value = rule->values[i];
- int result = 0;
-
- switch (field) {
- case AUDIT_PID:
- result = (cb->creds.pid == value);
- break;
- case AUDIT_UID:
- result = (cb->creds.uid == value);
- break;
- case AUDIT_GID:
- result = (cb->creds.gid == value);
- break;
- case AUDIT_LOGINUID:
- result = (cb->loginuid == value);
- break;
- }
-
- if (rule->fields[i] & AUDIT_NEGATE)
- result = !result;
- if (!result)
- return 0;
- }
- switch (rule->action) {
- case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
- case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
- }
- return 1;
-}
-
-int audit_filter_user(struct netlink_skb_parms *cb, int type)
-{
- struct audit_entry *e;
- enum audit_state state;
- int ret = 1;
-
- rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- if (audit_filter_user_rules(cb, &e->rule, &state)) {
- if (state == AUDIT_DISABLED)
- ret = 0;
- break;
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit
+ && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
}
}
rcu_read_unlock();
-
- return ret; /* Audit by default */
+ return AUDIT_BUILD_CONTEXT;
}
/* This should be called with task_lock() held. */
@@ -654,17 +369,18 @@ static inline void audit_free_names(struct audit_context *context)
#if AUDIT_DEBUG == 2
if (context->auditable
||context->put_count + context->ino_count != context->name_count) {
- printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+ printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
" name_count=%d put_count=%d"
" ino_count=%d [NOT freeing]\n",
- __LINE__,
+ __FILE__, __LINE__,
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
- for (i = 0; i < context->name_count; i++)
+ for (i = 0; i < context->name_count; i++) {
printk(KERN_ERR "names[%d] = %p = %s\n", i,
context->names[i].name,
- context->names[i].name);
+ context->names[i].name ?: "(null)");
+ }
dump_stack();
return;
}
@@ -674,9 +390,13 @@ static inline void audit_free_names(struct audit_context *context)
context->ino_count = 0;
#endif
- for (i = 0; i < context->name_count; i++)
+ for (i = 0; i < context->name_count; i++) {
+ char *p = context->names[i].ctx;
+ context->names[i].ctx = NULL;
+ kfree(p);
if (context->names[i].name)
__putname(context->names[i].name);
+ }
context->name_count = 0;
if (context->pwd)
dput(context->pwd);
@@ -696,6 +416,12 @@ static inline void audit_free_aux(struct audit_context *context)
dput(axi->dentry);
mntput(axi->mnt);
}
+ if ( aux->type == AUDIT_IPC ) {
+ struct audit_aux_data_ipcctl *axi = (void *)aux;
+ if (axi->ctx)
+ kfree(axi->ctx);
+ }
+
context->aux = aux->next;
kfree(aux);
}
@@ -721,10 +447,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
return context;
}
-/* Filter on the task information and allocate a per-task audit context
+/**
+ * audit_alloc - allocate an audit context block for a task
+ * @tsk: task
+ *
+ * Filter on the task information and allocate a per-task audit context
* if necessary. Doing so turns on system call auditing for the
* specified task. This is called from copy_process, so no lock is
- * needed. */
+ * needed.
+ */
int audit_alloc(struct task_struct *tsk)
{
struct audit_context *context;
@@ -775,7 +506,37 @@ static inline void audit_free_context(struct audit_context *context)
printk(KERN_ERR "audit: freed %d contexts\n", count);
}
-static void audit_log_task_info(struct audit_buffer *ab)
+static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask)
+{
+ char *ctx = NULL;
+ ssize_t len = 0;
+
+ len = security_getprocattr(current, "current", NULL, 0);
+ if (len < 0) {
+ if (len != -EINVAL)
+ goto error_path;
+ return;
+ }
+
+ ctx = kmalloc(len, gfp_mask);
+ if (!ctx)
+ goto error_path;
+
+ len = security_getprocattr(current, "current", ctx, len);
+ if (len < 0 )
+ goto error_path;
+
+ audit_log_format(ab, " subj=%s", ctx);
+ return;
+
+error_path:
+ if (ctx)
+ kfree(ctx);
+ audit_panic("error in audit_log_task_context");
+ return;
+}
+
+static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask)
{
char name[sizeof(current->comm)];
struct mm_struct *mm = current->mm;
@@ -788,6 +549,10 @@ static void audit_log_task_info(struct audit_buffer *ab)
if (!mm)
return;
+ /*
+ * this is brittle; all callers that pass GFP_ATOMIC will have
+ * NULL current->mm and we won't get here.
+ */
down_read(&mm->mmap_sem);
vma = mm->mmap;
while (vma) {
@@ -801,6 +566,7 @@ static void audit_log_task_info(struct audit_buffer *ab)
vma = vma->vm_next;
}
up_read(&mm->mmap_sem);
+ audit_log_task_context(ab, gfp_mask);
}
static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
@@ -808,6 +574,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
int i;
struct audit_buffer *ab;
struct audit_aux_data *aux;
+ const char *tty;
ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL);
if (!ab)
@@ -820,11 +587,15 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
+ if (current->signal->tty && current->signal->tty->name)
+ tty = current->signal->tty->name;
+ else
+ tty = "(none)";
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
" pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
- " egid=%u sgid=%u fsgid=%u",
+ " egid=%u sgid=%u fsgid=%u tty=%s",
context->argv[0],
context->argv[1],
context->argv[2],
@@ -835,8 +606,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
context->uid,
context->gid,
context->euid, context->suid, context->fsuid,
- context->egid, context->sgid, context->fsgid);
- audit_log_task_info(ab);
+ context->egid, context->sgid, context->fsgid, tty);
+ audit_log_task_info(ab, gfp_mask);
audit_log_end(ab);
for (aux = context->aux; aux; aux = aux->next) {
@@ -849,8 +620,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
case AUDIT_IPC: {
struct audit_aux_data_ipcctl *axi = (void *)aux;
audit_log_format(ab,
- " qbytes=%lx iuid=%u igid=%u mode=%x",
- axi->qbytes, axi->uid, axi->gid, axi->mode);
+ " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s",
+ axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx);
break; }
case AUDIT_SOCKETCALL: {
@@ -885,42 +656,62 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
}
}
for (i = 0; i < context->name_count; i++) {
+ unsigned long ino = context->names[i].ino;
+ unsigned long pino = context->names[i].pino;
+
ab = audit_log_start(context, gfp_mask, AUDIT_PATH);
if (!ab)
continue; /* audit_panic has been called */
audit_log_format(ab, "item=%d", i);
- if (context->names[i].name) {
- audit_log_format(ab, " name=");
+
+ audit_log_format(ab, " name=");
+ if (context->names[i].name)
audit_log_untrustedstring(ab, context->names[i].name);
- }
- audit_log_format(ab, " flags=%x\n", context->names[i].flags);
-
- if (context->names[i].ino != (unsigned long)-1)
- audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- context->names[i].ino,
- MAJOR(context->names[i].dev),
- MINOR(context->names[i].dev),
- context->names[i].mode,
- context->names[i].uid,
- context->names[i].gid,
- MAJOR(context->names[i].rdev),
+ else
+ audit_log_format(ab, "(null)");
+
+ if (pino != (unsigned long)-1)
+ audit_log_format(ab, " parent=%lu", pino);
+ if (ino != (unsigned long)-1)
+ audit_log_format(ab, " inode=%lu", ino);
+ if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1))
+ audit_log_format(ab, " dev=%02x:%02x mode=%#o"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ MAJOR(context->names[i].dev),
+ MINOR(context->names[i].dev),
+ context->names[i].mode,
+ context->names[i].uid,
+ context->names[i].gid,
+ MAJOR(context->names[i].rdev),
MINOR(context->names[i].rdev));
+ if (context->names[i].ctx) {
+ audit_log_format(ab, " obj=%s",
+ context->names[i].ctx);
+ }
+
audit_log_end(ab);
}
}
-/* Free a per-task audit context. Called from copy_process and
- * __put_task_struct. */
+/**
+ * audit_free - free a per-task audit context
+ * @tsk: task whose audit context block to free
+ *
+ * Called from copy_process and __put_task_struct.
+ */
void audit_free(struct task_struct *tsk)
{
struct audit_context *context;
- task_lock(tsk);
+ /*
+ * No need to lock the task - when we execute audit_free()
+ * then the task has no external references anymore, and
+ * we are tearing it down. (The locking also confuses
+ * DEBUG_LOCKDEP - this freeing may occur in softirq
+ * contexts as well, via RCU.)
+ */
context = audit_get_context(tsk, 0, 0);
- task_unlock(tsk);
-
if (likely(!context))
return;
@@ -934,13 +725,24 @@ void audit_free(struct task_struct *tsk)
audit_free_context(context);
}
-/* Fill in audit context at syscall entry. This only happens if the
+/**
+ * audit_syscall_entry - fill in an audit record at syscall entry
+ * @tsk: task being audited
+ * @arch: architecture type
+ * @major: major syscall type (function)
+ * @a1: additional syscall register 1
+ * @a2: additional syscall register 2
+ * @a3: additional syscall register 3
+ * @a4: additional syscall register 4
+ *
+ * Fill in audit context at syscall entry. This only happens if the
* audit context was created when the task was created and the state or
* filters demand the audit context be built. If the state from the
* per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
* then the record will be written at syscall exit time (otherwise, it
* will only be written if another part of the kernel requests that it
- * be written). */
+ * be written).
+ */
void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
@@ -950,7 +752,8 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
BUG_ON(!context);
- /* This happens only on certain architectures that make system
+ /*
+ * This happens only on certain architectures that make system
* calls in kernel_thread via the entry.S interface, instead of
* with direct calls. (If you are porting to a new
* architecture, hitting this condition can indicate that you
@@ -958,7 +761,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
*
* i386 no
* x86_64 no
- * ppc64 yes (see arch/ppc64/kernel/misc.S)
+ * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
*
* This also happens with vm86 emulation in a non-nested manner
* (entries without exits), so this case must be caught.
@@ -966,11 +769,6 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
if (context->in_syscall) {
struct audit_context *newctx;
-#if defined(__NR_vm86) && defined(__NR_vm86old)
- /* vm86 mode should only be entered once */
- if (major == __NR_vm86 || major == __NR_vm86old)
- return;
-#endif
#if AUDIT_DEBUG
printk(KERN_ERR
"audit(:%d) pid=%d in syscall=%d;"
@@ -1014,11 +812,18 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
}
-/* Tear down after system call. If the audit context has been marked as
+/**
+ * audit_syscall_exit - deallocate audit context after a system call
+ * @tsk: task being audited
+ * @valid: success/failure flag
+ * @return_code: syscall return value
+ *
+ * Tear down after system call. If the audit context has been marked as
* auditable (either because of the AUDIT_RECORD_CONTEXT state from
* filtering, or because some other part of the kernel write an audit
* message), then write out the syscall information. In call cases,
- * free the names stored from getname(). */
+ * free the names stored from getname().
+ */
void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
{
struct audit_context *context;
@@ -1053,7 +858,13 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
put_task_struct(tsk);
}
-/* Add a name to the list. Called from fs/namei.c:getname(). */
+/**
+ * audit_getname - add a name to the list
+ * @name: name to add
+ *
+ * Add a name to the list of audit names for this context.
+ * Called from fs/namei.c:getname().
+ */
void audit_getname(const char *name)
{
struct audit_context *context = current->audit_context;
@@ -1082,10 +893,13 @@ void audit_getname(const char *name)
}
-/* Intercept a putname request. Called from
- * include/linux/fs.h:putname(). If we have stored the name from
- * getname in the audit context, then we delay the putname until syscall
- * exit. */
+/* audit_putname - intercept a putname request
+ * @name: name to intercept and delay for putname
+ *
+ * If we have stored the name from getname in the audit context,
+ * then we delay the putname until syscall exit.
+ * Called from include/linux/fs.h:putname().
+ */
void audit_putname(const char *name)
{
struct audit_context *context = current->audit_context;
@@ -1100,7 +914,7 @@ void audit_putname(const char *name)
for (i = 0; i < context->name_count; i++)
printk(KERN_ERR "name[%d] = %p = %s\n", i,
context->names[i].name,
- context->names[i].name);
+ context->names[i].name ?: "(null)");
}
#endif
__putname(name);
@@ -1122,9 +936,52 @@ void audit_putname(const char *name)
#endif
}
-/* Store the inode and device from a lookup. Called from
- * fs/namei.c:path_lookup(). */
-void audit_inode(const char *name, const struct inode *inode, unsigned flags)
+void audit_inode_context(int idx, const struct inode *inode)
+{
+ struct audit_context *context = current->audit_context;
+ const char *suffix = security_inode_xattr_getsuffix();
+ char *ctx = NULL;
+ int len = 0;
+
+ if (!suffix)
+ goto ret;
+
+ len = security_inode_getsecurity(inode, suffix, NULL, 0, 0);
+ if (len == -EOPNOTSUPP)
+ goto ret;
+ if (len < 0)
+ goto error_path;
+
+ ctx = kmalloc(len, GFP_KERNEL);
+ if (!ctx)
+ goto error_path;
+
+ len = security_inode_getsecurity(inode, suffix, ctx, len, 0);
+ if (len < 0)
+ goto error_path;
+
+ kfree(context->names[idx].ctx);
+ context->names[idx].ctx = ctx;
+ goto ret;
+
+error_path:
+ if (ctx)
+ kfree(ctx);
+ audit_panic("error in audit_inode_context");
+ret:
+ return;
+}
+
+
+/**
+ * audit_inode - store the inode and device from a lookup
+ * @name: name being audited
+ * @inode: inode being audited
+ * @flags: lookup flags (as used in path_lookup())
+ *
+ * Called from fs/namei.c:path_lookup().
+ */
+void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
{
int idx;
struct audit_context *context = current->audit_context;
@@ -1150,15 +1007,105 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags)
++context->ino_count;
#endif
}
- context->names[idx].flags = flags;
- context->names[idx].ino = inode->i_ino;
context->names[idx].dev = inode->i_sb->s_dev;
context->names[idx].mode = inode->i_mode;
context->names[idx].uid = inode->i_uid;
context->names[idx].gid = inode->i_gid;
context->names[idx].rdev = inode->i_rdev;
+ audit_inode_context(idx, inode);
+ if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
+ (strcmp(name, ".") != 0)) {
+ context->names[idx].ino = (unsigned long)-1;
+ context->names[idx].pino = inode->i_ino;
+ } else {
+ context->names[idx].ino = inode->i_ino;
+ context->names[idx].pino = (unsigned long)-1;
+ }
+}
+
+/**
+ * audit_inode_child - collect inode info for created/removed objects
+ * @dname: inode's dentry name
+ * @inode: inode being audited
+ * @pino: inode number of dentry parent
+ *
+ * For syscalls that create or remove filesystem objects, audit_inode
+ * can only collect information for the filesystem object's parent.
+ * This call updates the audit context with the child's information.
+ * Syscalls that create a new filesystem object must be hooked after
+ * the object is created. Syscalls that remove a filesystem object
+ * must be hooked prior, in order to capture the target inode during
+ * unsuccessful attempts.
+ */
+void __audit_inode_child(const char *dname, const struct inode *inode,
+ unsigned long pino)
+{
+ int idx;
+ struct audit_context *context = current->audit_context;
+
+ if (!context->in_syscall)
+ return;
+
+ /* determine matching parent */
+ if (dname)
+ for (idx = 0; idx < context->name_count; idx++)
+ if (context->names[idx].pino == pino) {
+ const char *n;
+ const char *name = context->names[idx].name;
+ int dlen = strlen(dname);
+ int nlen = name ? strlen(name) : 0;
+
+ if (nlen < dlen)
+ continue;
+
+ /* disregard trailing slashes */
+ n = name + nlen - 1;
+ while ((*n == '/') && (n > name))
+ n--;
+
+ /* find last path component */
+ n = n - dlen + 1;
+ if (n < name)
+ continue;
+ else if (n > name) {
+ if (*--n != '/')
+ continue;
+ else
+ n++;
+ }
+
+ if (strncmp(n, dname, dlen) == 0)
+ goto update_context;
+ }
+
+ /* catch-all in case match not found */
+ idx = context->name_count++;
+ context->names[idx].name = NULL;
+ context->names[idx].pino = pino;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+
+update_context:
+ if (inode) {
+ context->names[idx].ino = inode->i_ino;
+ context->names[idx].dev = inode->i_sb->s_dev;
+ context->names[idx].mode = inode->i_mode;
+ context->names[idx].uid = inode->i_uid;
+ context->names[idx].gid = inode->i_gid;
+ context->names[idx].rdev = inode->i_rdev;
+ audit_inode_context(idx, inode);
+ }
}
+/**
+ * auditsc_get_stamp - get local copies of audit_context values
+ * @ctx: audit_context for the task
+ * @t: timespec to store time recorded in the audit_context
+ * @serial: serial value that is recorded in the audit_context
+ *
+ * Also sets the context as auditable.
+ */
void auditsc_get_stamp(struct audit_context *ctx,
struct timespec *t, unsigned int *serial)
{
@@ -1170,6 +1117,15 @@ void auditsc_get_stamp(struct audit_context *ctx,
ctx->auditable = 1;
}
+/**
+ * audit_set_loginuid - set a task's audit_context loginuid
+ * @task: task whose audit context is being modified
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
{
if (task->audit_context) {
@@ -1188,12 +1144,59 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
return 0;
}
+/**
+ * audit_get_loginuid - get the loginuid for an audit_context
+ * @ctx: the audit_context
+ *
+ * Returns the context's loginuid or -1 if @ctx is NULL.
+ */
uid_t audit_get_loginuid(struct audit_context *ctx)
{
return ctx ? ctx->loginuid : -1;
}
-int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+static char *audit_ipc_context(struct kern_ipc_perm *ipcp)
+{
+ struct audit_context *context = current->audit_context;
+ char *ctx = NULL;
+ int len = 0;
+
+ if (likely(!context))
+ return NULL;
+
+ len = security_ipc_getsecurity(ipcp, NULL, 0);
+ if (len == -EOPNOTSUPP)
+ goto ret;
+ if (len < 0)
+ goto error_path;
+
+ ctx = kmalloc(len, GFP_ATOMIC);
+ if (!ctx)
+ goto error_path;
+
+ len = security_ipc_getsecurity(ipcp, ctx, len);
+ if (len < 0)
+ goto error_path;
+
+ return ctx;
+
+error_path:
+ kfree(ctx);
+ audit_panic("error in audit_ipc_context");
+ret:
+ return NULL;
+}
+
+/**
+ * audit_ipc_perms - record audit data for ipc
+ * @qbytes: msgq bytes
+ * @uid: msgq user id
+ * @gid: msgq group id
+ * @mode: msgq mode (permissions)
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
{
struct audit_aux_data_ipcctl *ax;
struct audit_context *context = current->audit_context;
@@ -1201,7 +1204,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
if (likely(!context))
return 0;
- ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
if (!ax)
return -ENOMEM;
@@ -1209,6 +1212,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
ax->uid = uid;
ax->gid = gid;
ax->mode = mode;
+ ax->ctx = audit_ipc_context(ipcp);
ax->d.type = AUDIT_IPC;
ax->d.next = context->aux;
@@ -1216,6 +1220,13 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
return 0;
}
+/**
+ * audit_socketcall - record audit data for sys_socketcall
+ * @nargs: number of args
+ * @args: args array
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
int audit_socketcall(int nargs, unsigned long *args)
{
struct audit_aux_data_socketcall *ax;
@@ -1237,6 +1248,13 @@ int audit_socketcall(int nargs, unsigned long *args)
return 0;
}
+/**
+ * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
+ * @len: data length in user space
+ * @a: data address in kernel space
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
int audit_sockaddr(int len, void *a)
{
struct audit_aux_data_sockaddr *ax;
@@ -1258,6 +1276,15 @@ int audit_sockaddr(int len, void *a)
return 0;
}
+/**
+ * audit_avc_path - record the granting or denial of permissions
+ * @dentry: dentry to record
+ * @mnt: mnt to record
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ *
+ * Called from security/selinux/avc.c::avc_audit()
+ */
int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
{
struct audit_aux_data_path *ax;
@@ -1279,6 +1306,14 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
return 0;
}
+/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
void audit_signal_info(int sig, struct task_struct *t)
{
extern pid_t audit_sig_pid;
@@ -1295,4 +1330,3 @@ void audit_signal_info(int sig, struct task_struct *t)
}
}
}
-
diff --git a/kernel/capability.c b/kernel/capability.c
index bfa3c92e16f2..1a4d8a40d3f9 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -233,3 +233,19 @@ out:
return ret;
}
+
+int __capable(struct task_struct *t, int cap)
+{
+ if (security_capable(t, cap) == 0) {
+ t->flags |= PF_SUPERPRIV;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__capable);
+
+int capable(int cap)
+{
+ return __capable(current, cap);
+}
+EXPORT_SYMBOL(capable);
diff --git a/kernel/compat.c b/kernel/compat.c
index 8c9cd88b6785..c1601a84f8d8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,10 +17,10 @@
#include <linux/time.h>
#include <linux/signal.h>
#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
-#include <linux/futex.h> /* for FUTEX_WAIT */
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/security.h>
+#include <linux/timex.h>
#include <asm/uaccess.h>
@@ -238,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
return ret;
}
-#ifdef CONFIG_FUTEX
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
- struct compat_timespec __user *utime, u32 __user *uaddr2,
- int val3)
-{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
- int val2 = 0;
-
- if ((op == FUTEX_WAIT) && utime) {
- if (get_compat_timespec(&t, utime))
- return -EFAULT;
- timeout = timespec_to_jiffies(&t) + 1;
- }
- if (op >= FUTEX_REQUEUE)
- val2 = (int) (unsigned long) utime;
-
- return do_futex((unsigned long)uaddr, op, val, timeout,
- (unsigned long)uaddr2, val2, val3);
-}
-#endif
-
asmlinkage long compat_sys_setrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
@@ -898,3 +876,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
return -ERESTARTNOHAND;
}
#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
+
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+{
+ struct timex txc;
+ int ret;
+
+ memset(&txc, 0, sizeof(struct timex));
+
+ if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+ __get_user(txc.modes, &utp->modes) ||
+ __get_user(txc.offset, &utp->offset) ||
+ __get_user(txc.freq, &utp->freq) ||
+ __get_user(txc.maxerror, &utp->maxerror) ||
+ __get_user(txc.esterror, &utp->esterror) ||
+ __get_user(txc.status, &utp->status) ||
+ __get_user(txc.constant, &utp->constant) ||
+ __get_user(txc.precision, &utp->precision) ||
+ __get_user(txc.tolerance, &utp->tolerance) ||
+ __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+ __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+ __get_user(txc.tick, &utp->tick) ||
+ __get_user(txc.ppsfreq, &utp->ppsfreq) ||
+ __get_user(txc.jitter, &utp->jitter) ||
+ __get_user(txc.shift, &utp->shift) ||
+ __get_user(txc.stabil, &utp->stabil) ||
+ __get_user(txc.jitcnt, &utp->jitcnt) ||
+ __get_user(txc.calcnt, &utp->calcnt) ||
+ __get_user(txc.errcnt, &utp->errcnt) ||
+ __get_user(txc.stbcnt, &utp->stbcnt))
+ return -EFAULT;
+
+ ret = do_adjtimex(&txc);
+
+ if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+ __put_user(txc.modes, &utp->modes) ||
+ __put_user(txc.offset, &utp->offset) ||
+ __put_user(txc.freq, &utp->freq) ||
+ __put_user(txc.maxerror, &utp->maxerror) ||
+ __put_user(txc.esterror, &utp->esterror) ||
+ __put_user(txc.status, &utp->status) ||
+ __put_user(txc.constant, &utp->constant) ||
+ __put_user(txc.precision, &utp->precision) ||
+ __put_user(txc.tolerance, &utp->tolerance) ||
+ __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+ __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+ __put_user(txc.tick, &utp->tick) ||
+ __put_user(txc.ppsfreq, &utp->ppsfreq) ||
+ __put_user(txc.jitter, &utp->jitter) ||
+ __put_user(txc.shift, &utp->shift) ||
+ __put_user(txc.stabil, &utp->stabil) ||
+ __put_user(txc.jitcnt, &utp->jitcnt) ||
+ __put_user(txc.calcnt, &utp->calcnt) ||
+ __put_user(txc.errcnt, &utp->errcnt) ||
+ __put_user(txc.stbcnt, &utp->stbcnt))
+ ret = -EFAULT;
+
+ return ret;
+}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e882c6babf41..fe2b8d0bfe4c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
/* This protects CPUs going up and down... */
static DECLARE_MUTEX(cpucontrol);
-static struct notifier_block *cpu_chain;
+static BLOCKING_NOTIFIER_HEAD(cpu_chain);
#ifdef CONFIG_HOTPLUG_CPU
static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
/* Need to know about CPUs going up/down? */
int register_cpu_notifier(struct notifier_block *nb)
{
- int ret;
-
- if ((ret = lock_cpu_hotplug_interruptible()) != 0)
- return ret;
- ret = notifier_chain_register(&cpu_chain, nb);
- unlock_cpu_hotplug();
- return ret;
+ return blocking_notifier_chain_register(&cpu_chain, nb);
}
EXPORT_SYMBOL(register_cpu_notifier);
void unregister_cpu_notifier(struct notifier_block *nb)
{
- lock_cpu_hotplug();
- notifier_chain_unregister(&cpu_chain, nb);
- unlock_cpu_hotplug();
+ blocking_notifier_chain_unregister(&cpu_chain, nb);
}
EXPORT_SYMBOL(unregister_cpu_notifier);
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
goto out;
}
- err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+ err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
(void *)(long)cpu);
if (err == NOTIFY_BAD) {
printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
p = __stop_machine_run(take_cpu_down, NULL, cpu);
if (IS_ERR(p)) {
/* CPU didn't die: tell everyone. Can't complain. */
- if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+ if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
(void *)(long)cpu) == NOTIFY_BAD)
BUG();
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
put_cpu();
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
- == NOTIFY_BAD)
+ if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
+ (void *)(long)cpu) == NOTIFY_BAD)
BUG();
check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
goto out;
}
- ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+ ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
if (ret == NOTIFY_BAD) {
printk("%s: attempt to bring up CPU %u failed\n",
__FUNCTION__, cpu);
@@ -223,15 +215,15 @@ int __devinit cpu_up(unsigned int cpu)
ret = __cpu_up(cpu);
if (ret != 0)
goto out_notify;
- if (!cpu_online(cpu))
- BUG();
+ BUG_ON(!cpu_online(cpu));
/* Now call notifier in preparation. */
- notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+ blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
out_notify:
if (ret != 0)
- notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
+ blocking_notifier_call_chain(&cpu_chain,
+ CPU_UP_CANCELED, hcpu);
out:
unlock_cpu_hotplug();
return ret;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12815d3f1a05..18aea1bd1284 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
* Processor and Memory placement constraints for sets of tasks.
*
* Copyright (C) 2003 BULL SA.
- * Copyright (C) 2004 Silicon Graphics, Inc.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
- * Portions Copyright (c) 2004 Silicon Graphics, Inc.
*
- * 2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ * 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
- * 2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ * 2004 May-July Rework by Paul Jackson.
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
@@ -53,7 +52,7 @@
#include <asm/uaccess.h>
#include <asm/atomic.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
#define CPUSET_SUPER_MAGIC 0x27e0eb
@@ -108,37 +107,49 @@ typedef enum {
CS_MEM_EXCLUSIVE,
CS_MEMORY_MIGRATE,
CS_REMOVED,
- CS_NOTIFY_ON_RELEASE
+ CS_NOTIFY_ON_RELEASE,
+ CS_SPREAD_PAGE,
+ CS_SPREAD_SLAB,
} cpuset_flagbits_t;
/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
- return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+ return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}
static inline int is_mem_exclusive(const struct cpuset *cs)
{
- return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+ return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}
static inline int is_removed(const struct cpuset *cs)
{
- return !!test_bit(CS_REMOVED, &cs->flags);
+ return test_bit(CS_REMOVED, &cs->flags);
}
static inline int notify_on_release(const struct cpuset *cs)
{
- return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+ return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
}
static inline int is_memory_migrate(const struct cpuset *cs)
{
- return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+ return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
/*
- * Increment this atomic integer everytime any cpuset changes its
+ * Increment this integer everytime any cpuset changes its
* mems_allowed value. Users of cpusets can track this generation
* number, and avoid having to lock and reload mems_allowed unless
* the cpuset they're using changes generation.
@@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs)
* on every visit to __alloc_pages(), to efficiently check whether
* its current->cpuset->mems_allowed has changed, requiring an update
* of its current->mems_allowed.
+ *
+ * Since cpuset_mems_generation is guarded by manage_mutex,
+ * there is no need to mark it atomic.
*/
-static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
+static int cpuset_mems_generation;
static struct cpuset top_cpuset = {
.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
@@ -168,63 +182,57 @@ static struct vfsmount *cpuset_mount;
static struct super_block *cpuset_sb;
/*
- * We have two global cpuset semaphores below. They can nest.
- * It is ok to first take manage_sem, then nest callback_sem. We also
+ * We have two global cpuset mutexes below. They can nest.
+ * It is ok to first take manage_mutex, then nest callback_mutex. We also
* require taking task_lock() when dereferencing a tasks cpuset pointer.
* See "The task_lock() exception", at the end of this comment.
*
- * A task must hold both semaphores to modify cpusets. If a task
- * holds manage_sem, then it blocks others wanting that semaphore,
- * ensuring that it is the only task able to also acquire callback_sem
+ * A task must hold both mutexes to modify cpusets. If a task
+ * holds manage_mutex, then it blocks others wanting that mutex,
+ * ensuring that it is the only task able to also acquire callback_mutex
* and be able to modify cpusets. It can perform various checks on
* the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding manage_sem. While it is
+ * also allocate memory while just holding manage_mutex. While it is
* performing these checks, various callback routines can briefly
- * acquire callback_sem to query cpusets. Once it is ready to make
- * the changes, it takes callback_sem, blocking everyone else.
+ * acquire callback_mutex to query cpusets. Once it is ready to make
+ * the changes, it takes callback_mutex, blocking everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
- * callback_sem, as that would risk double tripping on callback_sem
+ * callback_mutex, as that would risk double tripping on callback_mutex
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
- * If a task is only holding callback_sem, then it has read-only
+ * If a task is only holding callback_mutex, then it has read-only
* access to cpusets.
*
* The task_struct fields mems_allowed and mems_generation may only
* be accessed in the context of that task, so require no locks.
*
* Any task can increment and decrement the count field without lock.
- * So in general, code holding manage_sem or callback_sem can't rely
+ * So in general, code holding manage_mutex or callback_mutex can't rely
* on the count field not changing. However, if the count goes to
- * zero, then only attach_task(), which holds both semaphores, can
+ * zero, then only attach_task(), which holds both mutexes, can
* increment it again. Because a count of zero means that no tasks
* are currently attached, therefore there is no way a task attached
* to that cpuset can fork (the other way to increment the count).
- * So code holding manage_sem or callback_sem can safely assume that
+ * So code holding manage_mutex or callback_mutex can safely assume that
* if the count is zero, it will stay zero. Similarly, if a task
- * holds manage_sem or callback_sem on a cpuset with zero count, it
+ * holds manage_mutex or callback_mutex on a cpuset with zero count, it
* knows that the cpuset won't be removed, as cpuset_rmdir() needs
- * both of those semaphores.
- *
- * A possible optimization to improve parallelism would be to make
- * callback_sem a R/W semaphore (rwsem), allowing the callback routines
- * to proceed in parallel, with read access, until the holder of
- * manage_sem needed to take this rwsem for exclusive write access
- * and modify some cpusets.
+ * both of those mutexes.
*
* The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds manage_sem across the entire operation,
+ * the cpuset hierarchy holds manage_mutex across the entire operation,
* single threading all such cpuset modifications across the system.
*
- * The cpuset_common_file_read() handlers only hold callback_sem across
+ * The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
* The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
- * (usually) take either semaphore. These are the two most performance
+ * (usually) take either mutex. These are the two most performance
* critical pieces of code here. The exception occurs on cpuset_exit(),
- * when a task in a notify_on_release cpuset exits. Then manage_sem
+ * when a task in a notify_on_release cpuset exits. Then manage_mutex
* is taken, and if the cpuset count is zero, a usermode call made
* to /sbin/cpuset_release_agent with the name of the cpuset (path
* relative to the root of cpuset file system) as the argument.
@@ -242,9 +250,9 @@ static struct super_block *cpuset_sb;
*
* The need for this exception arises from the action of attach_task(),
* which overwrites one tasks cpuset pointer with another. It does
- * so using both semaphores, however there are several performance
+ * so using both mutexes, however there are several performance
* critical places that need to reference task->cpuset without the
- * expense of grabbing a system global semaphore. Therefore except as
+ * expense of grabbing a system global mutex. Therefore except as
* noted below, when dereferencing or, as in attach_task(), modifying
* a tasks cpuset pointer we use task_lock(), which acts on a spinlock
* (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +264,8 @@ static struct super_block *cpuset_sb;
* the routine cpuset_update_task_memory_state().
*/
-static DECLARE_MUTEX(manage_sem);
-static DECLARE_MUTEX(callback_sem);
+static DEFINE_MUTEX(manage_mutex);
+static DEFINE_MUTEX(callback_mutex);
/*
* A couple of forward declarations required, due to cyclic reference loop:
@@ -432,7 +440,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
}
/*
- * Call with manage_sem held. Writes path of cpuset into buf.
+ * Call with manage_mutex held. Writes path of cpuset into buf.
* Returns 0 on success, -errno on error.
*/
@@ -484,11 +492,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
* status of the /sbin/cpuset_release_agent task, so no sense holding
* our caller up for that.
*
- * When we had only one cpuset semaphore, we had to call this
+ * When we had only one cpuset mutex, we had to call this
* without holding it, to avoid deadlock when call_usermodehelper()
* allocated memory. With two locks, we could now call this while
- * holding manage_sem, but we still don't, so as to minimize
- * the time manage_sem is held.
+ * holding manage_mutex, but we still don't, so as to minimize
+ * the time manage_mutex is held.
*/
static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +528,15 @@ static void cpuset_release_agent(const char *pathbuf)
* cs is notify_on_release() and now both the user count is zero and
* the list of children is empty, prepare cpuset path in a kmalloc'd
* buffer, to be returned via ppathbuf, so that the caller can invoke
- * cpuset_release_agent() with it later on, once manage_sem is dropped.
- * Call here with manage_sem held.
+ * cpuset_release_agent() with it later on, once manage_mutex is dropped.
+ * Call here with manage_mutex held.
*
* This check_for_release() routine is responsible for kmalloc'ing
* pathbuf. The above cpuset_release_agent() is responsible for
* kfree'ing pathbuf. The caller of these routines is responsible
* for providing a pathbuf pointer, initialized to NULL, then
- * calling check_for_release() with manage_sem held and the address
- * of the pathbuf pointer, then dropping manage_sem, then calling
+ * calling check_for_release() with manage_mutex held and the address
+ * of the pathbuf pointer, then dropping manage_mutex, then calling
* cpuset_release_agent() with pathbuf, as set by check_for_release().
*/
@@ -559,7 +567,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_map.
*
- * Call with callback_sem held.
+ * Call with callback_mutex held.
*/
static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +591,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_online_map.
*
- * Call with callback_sem held.
+ * Call with callback_mutex held.
*/
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +616,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* current->cpuset if a task has its memory placement changed.
* Do not call this routine if in_interrupt().
*
- * Call without callback_sem or task_lock() held. May be called
- * with or without manage_sem held. Doesn't need task_lock to guard
+ * Call without callback_mutex or task_lock() held. May be called
+ * with or without manage_mutex held. Doesn't need task_lock to guard
* against another task changing a non-NULL cpuset pointer to NULL,
* as that is only done by a task on itself, and if the current task
* is here, it is not simultaneously in the exit code NULL'ing its
- * cpuset pointer. This routine also might acquire callback_sem and
+ * cpuset pointer. This routine also might acquire callback_mutex and
* current->mm->mmap_sem during call.
*
* Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +666,21 @@ void cpuset_update_task_memory_state(void)
}
if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
task_lock(tsk);
cs = tsk->cpuset; /* Maybe changed when task not locked */
guarantee_online_mems(cs, &tsk->mems_allowed);
tsk->cpuset_mems_generation = cs->mems_generation;
+ if (is_spread_page(cs))
+ tsk->flags |= PF_SPREAD_PAGE;
+ else
+ tsk->flags &= ~PF_SPREAD_PAGE;
+ if (is_spread_slab(cs))
+ tsk->flags |= PF_SPREAD_SLAB;
+ else
+ tsk->flags &= ~PF_SPREAD_SLAB;
task_unlock(tsk);
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
mpol_rebind_task(tsk, &tsk->mems_allowed);
}
}
@@ -674,7 +690,7 @@ void cpuset_update_task_memory_state(void)
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding manage_sem.
+ * are only set if the other's are set. Call holding manage_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +708,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * manage_sem held.
+ * manage_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -746,7 +762,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
* exclusive child cpusets
* Build these two partitions by calling partition_sched_domains
*
- * Call with manage_sem held. May nest a call to the
+ * Call with manage_mutex held. May nest a call to the
* lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
*/
@@ -792,7 +808,7 @@ static void update_cpu_domains(struct cpuset *cur)
}
/*
- * Call with manage_sem held. May take callback_sem during call.
+ * Call with manage_mutex held. May take callback_mutex during call.
*/
static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,9 +827,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
if (retval < 0)
return retval;
cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
cs->cpus_allowed = trialcs.cpus_allowed;
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
if (is_cpu_exclusive(cs) && !cpus_unchanged)
update_cpu_domains(cs);
return 0;
@@ -827,7 +843,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
* the cpuset is marked 'memory_migrate', migrate the tasks
* pages to the new memory.
*
- * Call with manage_sem held. May take callback_sem during call.
+ * Call with manage_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -862,11 +878,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
if (retval < 0)
goto done;
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs.mems_allowed;
- atomic_inc(&cpuset_mems_generation);
- cs->mems_generation = atomic_read(&cpuset_mems_generation);
- up(&callback_sem);
+ cs->mems_generation = cpuset_mems_generation++;
+ mutex_unlock(&callback_mutex);
set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
@@ -922,7 +937,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
* tasklist_lock. Forks can happen again now - the mpol_copy()
* cpuset_being_rebound check will catch such forks, and rebind
* their vma mempolicies too. Because we still hold the global
- * cpuset manage_sem, we know that no other rebind effort will
+ * cpuset manage_mutex, we know that no other rebind effort will
* be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -948,7 +963,7 @@ done:
}
/*
- * Call with manage_sem held.
+ * Call with manage_mutex held.
*/
static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -963,11 +978,12 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
+ * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
+ * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
* cs: the cpuset to update
* buf: the buffer where we read the 0 or 1
*
- * Call with manage_sem held.
+ * Call with manage_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +1005,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
return err;
cpu_exclusive_changed =
(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
if (turning_on)
set_bit(bit, &cs->flags);
else
clear_bit(bit, &cs->flags);
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
if (cpu_exclusive_changed)
update_cpu_domains(cs);
@@ -1104,7 +1120,7 @@ static int fmeter_getrate(struct fmeter *fmp)
* writing the path of the old cpuset in 'ppathbuf' if it needs to be
* notified on release.
*
- * Call holding manage_sem. May take callback_sem and task_lock of
+ * Call holding manage_mutex. May take callback_mutex and task_lock of
* the task 'pid' during call.
*/
@@ -1144,13 +1160,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
get_task_struct(tsk);
}
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
task_lock(tsk);
oldcs = tsk->cpuset;
if (!oldcs) {
task_unlock(tsk);
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
put_task_struct(tsk);
return -ESRCH;
}
@@ -1164,7 +1180,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
from = oldcs->mems_allowed;
to = cs->mems_allowed;
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
mm = get_task_mm(tsk);
if (mm) {
@@ -1194,6 +1210,8 @@ typedef enum {
FILE_NOTIFY_ON_RELEASE,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
+ FILE_SPREAD_PAGE,
+ FILE_SPREAD_SLAB,
FILE_TASKLIST,
} cpuset_filetype_t;
@@ -1221,7 +1239,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
}
buffer[nbytes] = 0; /* nul-terminate */
- down(&manage_sem);
+ mutex_lock(&manage_mutex);
if (is_removed(cs)) {
retval = -ENODEV;
@@ -1253,6 +1271,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
case FILE_MEMORY_PRESSURE:
retval = -EACCES;
break;
+ case FILE_SPREAD_PAGE:
+ retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+ cs->mems_generation = cpuset_mems_generation++;
+ break;
+ case FILE_SPREAD_SLAB:
+ retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+ cs->mems_generation = cpuset_mems_generation++;
+ break;
case FILE_TASKLIST:
retval = attach_task(cs, buffer, &pathbuf);
break;
@@ -1264,7 +1290,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
if (retval == 0)
retval = nbytes;
out2:
- up(&manage_sem);
+ mutex_unlock(&manage_mutex);
cpuset_release_agent(pathbuf);
out1:
kfree(buffer);
@@ -1304,9 +1330,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
cpumask_t mask;
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
mask = cs->cpus_allowed;
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
return cpulist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -1315,9 +1341,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
nodemask_t mask;
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
mask = cs->mems_allowed;
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
return nodelist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -1362,6 +1388,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
case FILE_MEMORY_PRESSURE:
s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
break;
+ case FILE_SPREAD_PAGE:
+ *s++ = is_spread_page(cs) ? '1' : '0';
+ break;
+ case FILE_SPREAD_SLAB:
+ *s++ = is_spread_slab(cs) ? '1' : '0';
+ break;
default:
retval = -EINVAL;
goto out;
@@ -1598,7 +1630,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
* Handle an open on 'tasks' file. Prepare a buffer listing the
* process id's of tasks currently attached to the cpuset being opened.
*
- * Does not require any specific cpuset semaphores, and does not take any.
+ * Does not require any specific cpuset mutexes, and does not take any.
*/
static int cpuset_tasks_open(struct inode *unused, struct file *file)
{
@@ -1725,6 +1757,16 @@ static struct cftype cft_memory_pressure = {
.private = FILE_MEMORY_PRESSURE,
};
+static struct cftype cft_spread_page = {
+ .name = "memory_spread_page",
+ .private = FILE_SPREAD_PAGE,
+};
+
+static struct cftype cft_spread_slab = {
+ .name = "memory_spread_slab",
+ .private = FILE_SPREAD_SLAB,
+};
+
static int cpuset_populate_dir(struct dentry *cs_dentry)
{
int err;
@@ -1743,6 +1785,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
+ return err;
if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
return err;
return 0;
@@ -1754,7 +1800,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
* name: name of the new cpuset. Will be strcpy'ed.
* mode: mode to set on new inode
*
- * Must be called with the semaphore on the parent inode held
+ * Must be called with the mutex on the parent inode held
*/
static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,44 +1812,47 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
if (!cs)
return -ENOMEM;
- down(&manage_sem);
+ mutex_lock(&manage_mutex);
cpuset_update_task_memory_state();
cs->flags = 0;
if (notify_on_release(parent))
set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
cs->cpus_allowed = CPU_MASK_NONE;
cs->mems_allowed = NODE_MASK_NONE;
atomic_set(&cs->count, 0);
INIT_LIST_HEAD(&cs->sibling);
INIT_LIST_HEAD(&cs->children);
- atomic_inc(&cpuset_mems_generation);
- cs->mems_generation = atomic_read(&cpuset_mems_generation);
+ cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);
cs->parent = parent;
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
list_add(&cs->sibling, &cs->parent->children);
number_of_cpusets++;
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
err = cpuset_create_dir(cs, name, mode);
if (err < 0)
goto err;
/*
- * Release manage_sem before cpuset_populate_dir() because it
+ * Release manage_mutex before cpuset_populate_dir() because it
* will down() this new directory's i_mutex and if we race with
* another mkdir, we might deadlock.
*/
- up(&manage_sem);
+ mutex_unlock(&manage_mutex);
err = cpuset_populate_dir(cs->dentry);
/* If err < 0, we have a half-filled directory - oh well ;) */
return 0;
err:
list_del(&cs->sibling);
- up(&manage_sem);
+ mutex_unlock(&manage_mutex);
kfree(cs);
return err;
}
@@ -1825,18 +1874,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
/* the vfs holds both inode->i_mutex already */
- down(&manage_sem);
+ mutex_lock(&manage_mutex);
cpuset_update_task_memory_state();
if (atomic_read(&cs->count) > 0) {
- up(&manage_sem);
+ mutex_unlock(&manage_mutex);
return -EBUSY;
}
if (!list_empty(&cs->children)) {
- up(&manage_sem);
+ mutex_unlock(&manage_mutex);
return -EBUSY;
}
parent = cs->parent;
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
set_bit(CS_REMOVED, &cs->flags);
if (is_cpu_exclusive(cs))
update_cpu_domains(cs);
@@ -1848,10 +1897,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
cpuset_d_remove_dir(d);
dput(d);
number_of_cpusets--;
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
- up(&manage_sem);
+ mutex_unlock(&manage_mutex);
cpuset_release_agent(pathbuf);
return 0;
}
@@ -1867,7 +1916,7 @@ int __init cpuset_init_early(void)
struct task_struct *tsk = current;
tsk->cpuset = &top_cpuset;
- tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
+ tsk->cpuset->mems_generation = cpuset_mems_generation++;
return 0;
}
@@ -1886,8 +1935,7 @@ int __init cpuset_init(void)
top_cpuset.mems_allowed = NODE_MASK_ALL;
fmeter_init(&top_cpuset.fmeter);
- atomic_inc(&cpuset_mems_generation);
- top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
+ top_cpuset.mems_generation = cpuset_mems_generation++;
init_task.cpuset = &top_cpuset;
@@ -1960,25 +2008,25 @@ void cpuset_fork(struct task_struct *child)
* Description: Detach cpuset from @tsk and release it.
*
* Note that cpusets marked notify_on_release force every task in
- * them to take the global manage_sem semaphore when exiting.
+ * them to take the global manage_mutex mutex when exiting.
* This could impact scaling on very large systems. Be reluctant to
* use notify_on_release cpusets where very high task exit scaling
* is required on large systems.
*
* Don't even think about derefencing 'cs' after the cpuset use count
- * goes to zero, except inside a critical section guarded by manage_sem
- * or callback_sem. Otherwise a zero cpuset use count is a license to
+ * goes to zero, except inside a critical section guarded by manage_mutex
+ * or callback_mutex. Otherwise a zero cpuset use count is a license to
* any other task to nuke the cpuset immediately, via cpuset_rmdir().
*
- * This routine has to take manage_sem, not callback_sem, because
- * it is holding that semaphore while calling check_for_release(),
- * which calls kmalloc(), so can't be called holding callback__sem().
+ * This routine has to take manage_mutex, not callback_mutex, because
+ * it is holding that mutex while calling check_for_release(),
+ * which calls kmalloc(), so can't be called holding callback_mutex().
*
* We don't need to task_lock() this reference to tsk->cpuset,
* because tsk is already marked PF_EXITING, so attach_task() won't
* mess with it, or task is a failed fork, never visible to attach_task.
*
- * Hack:
+ * the_top_cpuset_hack:
*
* Set the exiting tasks cpuset to the root cpuset (top_cpuset).
*
@@ -2017,15 +2065,15 @@ void cpuset_exit(struct task_struct *tsk)
struct cpuset *cs;
cs = tsk->cpuset;
- tsk->cpuset = &top_cpuset; /* Hack - see comment above */
+ tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
if (notify_on_release(cs)) {
char *pathbuf = NULL;
- down(&manage_sem);
+ mutex_lock(&manage_mutex);
if (atomic_dec_and_test(&cs->count))
check_for_release(cs, &pathbuf);
- up(&manage_sem);
+ mutex_unlock(&manage_mutex);
cpuset_release_agent(pathbuf);
} else {
atomic_dec(&cs->count);
@@ -2046,11 +2094,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
{
cpumask_t mask;
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
task_lock(tsk);
guarantee_online_cpus(tsk->cpuset, &mask);
task_unlock(tsk);
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
return mask;
}
@@ -2074,11 +2122,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
nodemask_t mask;
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
task_lock(tsk);
guarantee_online_mems(tsk->cpuset, &mask);
task_unlock(tsk);
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
return mask;
}
@@ -2104,7 +2152,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
/*
* nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset. Call holding callback_sem.
+ * ancestor to the specified cpuset. Call holding callback_mutex.
* If no ancestor is mem_exclusive (an unusual configuration), then
* returns the root cpuset.
*/
@@ -2131,12 +2179,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest mem_exclusive ancestor cpuset.
*
- * Scanning up parent cpusets requires callback_sem. The __alloc_pages()
+ * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
* routine only calls here with __GFP_HARDWALL bit _not_ set if
* it's a GFP_KERNEL allocation, and all nodes in the current tasks
* mems_allowed came up empty on the first pass over the zonelist.
* So only GFP_KERNEL allocations, if all nodes in the cpuset are
- * short of memory, might require taking the callback_sem semaphore.
+ * short of memory, might require taking the callback_mutex mutex.
*
* The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
* calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -2157,7 +2205,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
int node; /* node that zone z is on */
const struct cpuset *cs; /* current cpuset ancestors */
- int allowed = 1; /* is allocation in zone z allowed? */
+ int allowed; /* is allocation in zone z allowed? */
if (in_interrupt())
return 1;
@@ -2171,31 +2219,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
return 1;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
task_lock(current);
cs = nearest_exclusive_ancestor(current->cpuset);
task_unlock(current);
allowed = node_isset(node, cs->mems_allowed);
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
return allowed;
}
/**
* cpuset_lock - lock out any changes to cpuset structures
*
- * The out of memory (oom) code needs to lock down cpusets
+ * The out of memory (oom) code needs to mutex_lock cpusets
* from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_sem via this
+ * task in an overlapping cpuset. Expose callback_mutex via this
* cpuset_lock() routine, so the oom code can lock it, before
* locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_sem.
+ * must be taken inside callback_mutex.
*/
void cpuset_lock(void)
{
- down(&callback_sem);
+ mutex_lock(&callback_mutex);
}
/**
@@ -2206,10 +2254,48 @@ void cpuset_lock(void)
void cpuset_unlock(void)
{
- up(&callback_sem);
+ mutex_unlock(&callback_mutex);
}
/**
+ * cpuset_mem_spread_node() - On which node to begin search for a page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online. So it
+ * should not be possible for the following code to return an
+ * offline node. But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start. The zonelist passed to
+ * __alloc_pages() will include all nodes. If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+int cpuset_mem_spread_node(void)
+{
+ int node;
+
+ node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+ if (node == MAX_NUMNODES)
+ node = first_node(current->mems_allowed);
+ current->cpuset_mem_spread_rotor = node;
+ return node;
+}
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
+/**
* cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
* @p: pointer to task_struct of some other task.
*
@@ -2218,7 +2304,7 @@ void cpuset_unlock(void)
* determine if task @p's memory usage might impact the memory
* available to the current task.
*
- * Call while holding callback_sem.
+ * Call while holding callback_mutex.
**/
int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2289,13 +2375,13 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take manage_sem, keeping attach_task() from changing it
- * anyway.
+ * and we take manage_mutex, keeping attach_task() from changing it
+ * anyway. No need to check that tsk->cpuset != NULL, thanks to
+ * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
+ * cpuset to top_cpuset.
*/
-
static int proc_cpuset_show(struct seq_file *m, void *v)
{
- struct cpuset *cs;
struct task_struct *tsk;
char *buf;
int retval = 0;
@@ -2305,20 +2391,14 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
return -ENOMEM;
tsk = m->private;
- down(&manage_sem);
- cs = tsk->cpuset;
- if (!cs) {
- retval = -EINVAL;
- goto out;
- }
-
- retval = cpuset_path(cs, buf, PAGE_SIZE);
+ mutex_lock(&manage_mutex);
+ retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
if (retval < 0)
goto out;
seq_puts(m, buf);
seq_putc(m, '\n');
out:
- up(&manage_sem);
+ mutex_unlock(&manage_mutex);
kfree(buf);
return retval;
}
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 867d6dbeb574..c01cead2cfd6 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -140,6 +140,7 @@ __set_personality(u_long personality)
ep = lookup_exec_domain(personality);
if (ep == current_thread_info()->exec_domain) {
current->personality = personality;
+ module_put(ep->module);
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 531aadca5530..bc0ec674d3f4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,8 +29,11 @@
#include <linux/cpuset.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
+#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
+#include <linux/futex.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -48,15 +51,80 @@ static void __unhash_process(struct task_struct *p)
{
nr_threads--;
detach_pid(p, PIDTYPE_PID);
- detach_pid(p, PIDTYPE_TGID);
if (thread_group_leader(p)) {
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
- if (p->pid)
- __get_cpu_var(process_counts)--;
+
+ list_del_init(&p->tasks);
+ __get_cpu_var(process_counts)--;
}
+ list_del_rcu(&p->thread_group);
+ remove_parent(p);
+}
+
+/*
+ * This function expects the tasklist_lock write-locked.
+ */
+static void __exit_signal(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+ struct sighand_struct *sighand;
+
+ BUG_ON(!sig);
+ BUG_ON(!atomic_read(&sig->count));
+
+ rcu_read_lock();
+ sighand = rcu_dereference(tsk->sighand);
+ spin_lock(&sighand->siglock);
- REMOVE_LINKS(p);
+ posix_cpu_timers_exit(tsk);
+ if (atomic_dec_and_test(&sig->count))
+ posix_cpu_timers_exit_group(tsk);
+ else {
+ /*
+ * If there is any task waiting for the group exit
+ * then notify it:
+ */
+ if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
+ wake_up_process(sig->group_exit_task);
+ sig->group_exit_task = NULL;
+ }
+ if (tsk == sig->curr_target)
+ sig->curr_target = next_thread(tsk);
+ /*
+ * Accumulate here the counters for all threads but the
+ * group leader as they die, so they can be added into
+ * the process-wide totals when those are taken.
+ * The group leader stays around as a zombie as long
+ * as there are other threads. When it gets reaped,
+ * the exit.c code will add its counts into these totals.
+ * We won't ever get here for the group leader, since it
+ * will have been the last reference on the signal_struct.
+ */
+ sig->utime = cputime_add(sig->utime, tsk->utime);
+ sig->stime = cputime_add(sig->stime, tsk->stime);
+ sig->min_flt += tsk->min_flt;
+ sig->maj_flt += tsk->maj_flt;
+ sig->nvcsw += tsk->nvcsw;
+ sig->nivcsw += tsk->nivcsw;
+ sig->sched_time += tsk->sched_time;
+ sig = NULL; /* Marker for below. */
+ }
+
+ __unhash_process(tsk);
+
+ tsk->signal = NULL;
+ tsk->sighand = NULL;
+ spin_unlock(&sighand->siglock);
+ rcu_read_unlock();
+
+ __cleanup_sighand(sighand);
+ clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+ flush_sigqueue(&tsk->pending);
+ if (sig) {
+ flush_sigqueue(&sig->shared_pending);
+ __cleanup_signal(sig);
+ }
}
void release_task(struct task_struct * p)
@@ -65,21 +133,14 @@ void release_task(struct task_struct * p)
task_t *leader;
struct dentry *proc_dentry;
-repeat:
+repeat:
atomic_dec(&p->user->processes);
spin_lock(&p->proc_lock);
proc_dentry = proc_pid_unhash(p);
write_lock_irq(&tasklist_lock);
- if (unlikely(p->ptrace))
- __ptrace_unlink(p);
+ ptrace_unlink(p);
BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
__exit_signal(p);
- /*
- * Note that the fastpath in sys_times depends on __exit_signal having
- * updated the counters before a task is removed from the tasklist of
- * the process by __unhash_process.
- */
- __unhash_process(p);
/*
* If we are the last non-leader member of the thread
@@ -114,21 +175,6 @@ repeat:
goto repeat;
}
-/* we are using it only for SMP init */
-
-void unhash_process(struct task_struct *p)
-{
- struct dentry *proc_dentry;
-
- spin_lock(&p->proc_lock);
- proc_dentry = proc_pid_unhash(p);
- write_lock_irq(&tasklist_lock);
- __unhash_process(p);
- write_unlock_irq(&tasklist_lock);
- spin_unlock(&p->proc_lock);
- proc_pid_flush(proc_dentry);
-}
-
/*
* This checks not only the pgrp, but falls back on the pid if no
* satisfactory pgrp is found. I dunno - gdb doesn't work correctly
@@ -236,10 +282,10 @@ static void reparent_to_init(void)
ptrace_unlink(current);
/* Reparent to init */
- REMOVE_LINKS(current);
+ remove_parent(current);
current->parent = child_reaper;
current->real_parent = child_reaper;
- SET_LINKS(current);
+ add_parent(current);
/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
@@ -345,9 +391,9 @@ void daemonize(const char *name, ...)
exit_mm(current);
set_special_pids(1, 1);
- down(&tty_sem);
+ mutex_lock(&tty_mutex);
current->signal->tty = NULL;
- up(&tty_sem);
+ mutex_unlock(&tty_mutex);
/* Block and flush all signals */
sigfillset(&blocked);
@@ -536,13 +582,13 @@ static void exit_mm(struct task_struct * tsk)
mmput(mm);
}
-static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+static inline void choose_new_parent(task_t *p, task_t *reaper)
{
/*
* Make sure we're not reparenting to ourselves and that
* the parent is not a zombie.
*/
- BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE);
+ BUG_ON(p == reaper || reaper->exit_state);
p->real_parent = reaper;
}
@@ -567,9 +613,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
* anyway, so let go of it.
*/
p->ptrace = 0;
- list_del_init(&p->sibling);
+ remove_parent(p);
p->parent = p->real_parent;
- list_add_tail(&p->sibling, &p->parent->children);
+ add_parent(p);
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
@@ -643,7 +689,7 @@ static void forget_original_parent(struct task_struct * father,
if (father == p->real_parent) {
/* reparent with a reaper, real father it's us */
- choose_new_parent(p, reaper, child_reaper);
+ choose_new_parent(p, reaper);
reparent_thread(p, father, 0);
} else {
/* reparent ptraced task to its real parent */
@@ -664,7 +710,7 @@ static void forget_original_parent(struct task_struct * father,
}
list_for_each_safe(_p, _n, &father->ptrace_children) {
p = list_entry(_p,struct task_struct,ptrace_list);
- choose_new_parent(p, reaper, child_reaper);
+ choose_new_parent(p, reaper);
reparent_thread(p, father, 1);
}
}
@@ -805,10 +851,8 @@ fastcall NORET_TYPE void do_exit(long code)
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
- if (unlikely(tsk->pid == 1))
+ if (unlikely(tsk == child_reaper))
panic("Attempted to kill init!");
- if (tsk->io_context)
- exit_io_context();
if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
current->ptrace_message = code;
@@ -822,6 +866,8 @@ fastcall NORET_TYPE void do_exit(long code)
if (unlikely(tsk->flags & PF_EXITING)) {
printk(KERN_ALERT
"Fixing recursive fault but reboot is needed!\n");
+ if (tsk->io_context)
+ exit_io_context();
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
@@ -852,6 +898,12 @@ fastcall NORET_TYPE void do_exit(long code)
exit_itimers(tsk->signal);
acct_process(code);
}
+ if (unlikely(tsk->robust_list))
+ exit_robust_list(tsk);
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list))
+ compat_exit_robust_list(tsk);
+#endif
exit_mm(tsk);
exit_sem(tsk);
@@ -881,6 +933,9 @@ fastcall NORET_TYPE void do_exit(long code)
*/
mutex_debug_check_no_locks_held(tsk);
+ if (tsk->io_context)
+ exit_io_context();
+
/* PF_DEAD causes final put_task_struct after we schedule. */
preempt_disable();
BUG_ON(tsk->flags & PF_DEAD);
@@ -909,13 +964,6 @@ asmlinkage long sys_exit(int error_code)
do_exit((error_code&0xff)<<8);
}
-task_t fastcall *next_thread(const task_t *p)
-{
- return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
-}
-
-EXPORT_SYMBOL(next_thread);
-
/*
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
@@ -930,7 +978,6 @@ do_group_exit(int exit_code)
else if (!thread_group_empty(current)) {
struct signal_struct *const sig = current->signal;
struct sighand_struct *const sighand = current->sighand;
- read_lock(&tasklist_lock);
spin_lock_irq(&sighand->siglock);
if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
@@ -940,7 +987,6 @@ do_group_exit(int exit_code)
zap_other_threads(current);
}
spin_unlock_irq(&sighand->siglock);
- read_unlock(&tasklist_lock);
}
do_exit(exit_code);
@@ -1270,7 +1316,7 @@ bail_ref:
/* move to end of parent's list to avoid starvation */
remove_parent(p);
- add_parent(p, p->parent);
+ add_parent(p);
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index b373322ca497..b3f7a1bb5e55 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
#endif
/* SLAB cache for signal_struct structures (tsk->signal) */
-kmem_cache_t *signal_cachep;
+static kmem_cache_t *signal_cachep;
/* SLAB cache for sighand_struct structures (tsk->sighand) */
kmem_cache_t *sighand_cachep;
@@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
atomic_set(&tsk->fs_excl, 0);
+ tsk->btrace_seq = 0;
return tsk;
}
@@ -607,12 +608,12 @@ static struct files_struct *alloc_files(void)
atomic_set(&newf->count, 1);
spin_lock_init(&newf->file_lock);
+ newf->next_fd = 0;
fdt = &newf->fdtab;
- fdt->next_fd = 0;
fdt->max_fds = NR_OPEN_DEFAULT;
- fdt->max_fdset = __FD_SETSIZE;
- fdt->close_on_exec = &newf->close_on_exec_init;
- fdt->open_fds = &newf->open_fds_init;
+ fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
+ fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+ fdt->open_fds = (fd_set *)&newf->open_fds_init;
fdt->fd = &newf->fd_array[0];
INIT_RCU_HEAD(&fdt->rcu);
fdt->free_files = NULL;
@@ -768,8 +769,7 @@ int unshare_files(void)
struct files_struct *files = current->files;
int rc;
- if(!files)
- BUG();
+ BUG_ON(!files);
/* This can race but the race causes us to copy when we don't
need to and drop the copy */
@@ -786,14 +786,6 @@ int unshare_files(void)
EXPORT_SYMBOL(unshare_files);
-void sighand_free_cb(struct rcu_head *rhp)
-{
- struct sighand_struct *sp;
-
- sp = container_of(rhp, struct sighand_struct, rcu);
- kmem_cache_free(sighand_cachep, sp);
-}
-
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
struct sighand_struct *sig;
@@ -806,12 +798,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
rcu_assign_pointer(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
- spin_lock_init(&sig->siglock);
atomic_set(&sig->count, 1);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
return 0;
}
+void __cleanup_sighand(struct sighand_struct *sighand)
+{
+ if (atomic_dec_and_test(&sighand->count))
+ kmem_cache_free(sighand_cachep, sighand);
+}
+
static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
{
struct signal_struct *sig;
@@ -847,7 +844,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
sig->it_real_incr.tv64 = 0;
sig->real_timer.function = it_real_fn;
- sig->real_timer.data = tsk;
+ sig->tsk = tsk;
sig->it_virt_expires = cputime_zero;
sig->it_virt_incr = cputime_zero;
@@ -881,6 +878,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
return 0;
}
+void __cleanup_signal(struct signal_struct *sig)
+{
+ exit_thread_group_keys(sig);
+ kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void cleanup_signal(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+
+ atomic_dec(&sig->live);
+
+ if (atomic_dec_and_test(&sig->count))
+ __cleanup_signal(sig);
+}
+
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
@@ -1020,6 +1033,7 @@ static task_t *copy_process(unsigned long clone_flags,
p->mempolicy = NULL;
goto bad_fork_cleanup_cpuset;
}
+ mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_DEBUG_MUTEXES
@@ -1060,7 +1074,10 @@ static task_t *copy_process(unsigned long clone_flags,
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
-
+ p->robust_list = NULL;
+#ifdef CONFIG_COMPAT
+ p->compat_robust_list = NULL;
+#endif
/*
* sigaltstack should be cleared when sharing the same VM
*/
@@ -1091,6 +1108,7 @@ static task_t *copy_process(unsigned long clone_flags,
* We dont wake it up yet.
*/
p->group_leader = p;
+ INIT_LIST_HEAD(&p->thread_group);
INIT_LIST_HEAD(&p->ptrace_children);
INIT_LIST_HEAD(&p->ptrace_list);
@@ -1114,16 +1132,6 @@ static task_t *copy_process(unsigned long clone_flags,
!cpu_online(task_cpu(p))))
set_task_cpu(p, smp_processor_id());
- /*
- * Check for pending SIGKILL! The new thread should not be allowed
- * to slip out of an OOM kill. (or normal SIGKILL.)
- */
- if (sigismember(&current->pending.signal, SIGKILL)) {
- write_unlock_irq(&tasklist_lock);
- retval = -EINTR;
- goto bad_fork_cleanup_namespace;
- }
-
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
p->real_parent = current->real_parent;
@@ -1132,6 +1140,23 @@ static task_t *copy_process(unsigned long clone_flags,
p->parent = p->real_parent;
spin_lock(&current->sighand->siglock);
+
+ /*
+ * Process group and session signals need to be delivered to just the
+ * parent before the fork or both the parent and the child after the
+ * fork. Restart if a signal comes in before we add the new process to
+ * it's process group.
+ * A fatal signal pending means that current will exit, so the new
+ * thread can't slip out of an OOM kill (or normal SIGKILL).
+ */
+ recalc_sigpending();
+ if (signal_pending(current)) {
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
+ retval = -ERESTARTNOINTR;
+ goto bad_fork_cleanup_namespace;
+ }
+
if (clone_flags & CLONE_THREAD) {
/*
* Important: if an exit-all has been started then
@@ -1144,17 +1169,9 @@ static task_t *copy_process(unsigned long clone_flags,
retval = -EAGAIN;
goto bad_fork_cleanup_namespace;
}
- p->group_leader = current->group_leader;
- if (current->signal->group_stop_count > 0) {
- /*
- * There is an all-stop in progress for the group.
- * We ourselves will stop as soon as we check signals.
- * Make the new thread part of that group stop too.
- */
- current->signal->group_stop_count++;
- set_tsk_thread_flag(p, TIF_SIGPENDING);
- }
+ p->group_leader = current->group_leader;
+ list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
if (!cputime_eq(current->signal->it_virt_expires,
cputime_zero) ||
@@ -1177,23 +1194,25 @@ static task_t *copy_process(unsigned long clone_flags,
*/
p->ioprio = current->ioprio;
- SET_LINKS(p);
- if (unlikely(p->ptrace & PT_PTRACED))
- __ptrace_link(p, current->parent);
-
- if (thread_group_leader(p)) {
- p->signal->tty = current->signal->tty;
- p->signal->pgrp = process_group(current);
- p->signal->session = current->signal->session;
- attach_pid(p, PIDTYPE_PGID, process_group(p));
- attach_pid(p, PIDTYPE_SID, p->signal->session);
- if (p->pid)
+ if (likely(p->pid)) {
+ add_parent(p);
+ if (unlikely(p->ptrace & PT_PTRACED))
+ __ptrace_link(p, current->parent);
+
+ if (thread_group_leader(p)) {
+ p->signal->tty = current->signal->tty;
+ p->signal->pgrp = process_group(current);
+ p->signal->session = current->signal->session;
+ attach_pid(p, PIDTYPE_PGID, process_group(p));
+ attach_pid(p, PIDTYPE_SID, p->signal->session);
+
+ list_add_tail(&p->tasks, &init_task.tasks);
__get_cpu_var(process_counts)++;
+ }
+ attach_pid(p, PIDTYPE_PID, p->pid);
+ nr_threads++;
}
- attach_pid(p, PIDTYPE_TGID, p->tgid);
- attach_pid(p, PIDTYPE_PID, p->pid);
- nr_threads++;
total_forks++;
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
@@ -1208,9 +1227,9 @@ bad_fork_cleanup_mm:
if (p->mm)
mmput(p->mm);
bad_fork_cleanup_signal:
- exit_signal(p);
+ cleanup_signal(p);
bad_fork_cleanup_sighand:
- exit_sighand(p);
+ __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
@@ -1257,7 +1276,7 @@ task_t * __devinit fork_idle(int cpu)
if (!task)
return ERR_PTR(-ENOMEM);
init_idle(task, cpu);
- unhash_process(task);
+
return task;
}
@@ -1349,11 +1368,21 @@ long do_fork(unsigned long clone_flags,
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
+static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+{
+ struct sighand_struct *sighand = data;
+
+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ spin_lock_init(&sighand->siglock);
+}
+
void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+ sighand_ctor, NULL);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
@@ -1534,6 +1563,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
check_unshare_flags(&unshare_flags);
+ /* Return -EINVAL for all unsupported flags */
+ err = -EINVAL;
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
+ goto bad_unshare_out;
+
if ((err = unshare_thread(unshare_flags)))
goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f978032..9c9b2b6b22dd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
* Removed page pinning, fix privately mapped COW pages and other cleanups
* (C) Copyright 2003, 2004 Jamie Lokier
*
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,172 @@ error:
goto out;
}
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list - set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+asmlinkage long
+sys_set_robust_list(struct robust_list_head __user *head,
+ size_t len)
+{
+ /*
+ * The kernel knows only one size for now:
+ */
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->robust_list = head;
+
+ return 0;
+}
+
+/**
+ * sys_get_robust_list - get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+asmlinkage long
+sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+ size_t __user *len_ptr)
+{
+ struct robust_list_head *head;
+ unsigned long ret;
+
+ if (!pid)
+ head = current->robust_list;
+ else {
+ struct task_struct *p;
+
+ ret = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ if (!p)
+ goto err_unlock;
+ ret = -EPERM;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_PTRACE))
+ goto err_unlock;
+ head = p->robust_list;
+ read_unlock(&tasklist_lock);
+ }
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(head, head_ptr);
+
+err_unlock:
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
+{
+ u32 uval;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
+ if ((uval & FUTEX_TID_MASK) == curr->pid) {
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
+ uval | FUTEX_OWNER_DIED) != uval)
+ goto retry;
+
+ if (uval & FUTEX_WAITERS)
+ futex_wake((unsigned long)uaddr, 1);
+ }
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void exit_robust_list(struct task_struct *curr)
+{
+ struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list __user *entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT;
+ unsigned long futex_offset;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (get_user(entry, &head->list.next))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (get_user(pending, &head->list_op_pending))
+ return;
+ if (pending)
+ handle_futex_death((void *)pending + futex_offset, curr);
+
+ while (entry != &head->list) {
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending)
+ if (handle_futex_death((void *)entry + futex_offset,
+ curr))
+ return;
+ /*
+ * Fetch the next entry in the list:
+ */
+ if (get_user(entry, &entry->next))
+ return;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+}
+
long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
unsigned long uaddr2, int val2, int val3)
{
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 000000000000..54274fc85321
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,142 @@
+/*
+ * linux/kernel/futex_compat.c
+ *
+ * Futex compatibililty routines.
+ *
+ * Copyright 2006, Red Hat, Inc., Ingo Molnar
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/futex.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *pending;
+ compat_uptr_t uentry, upending;
+ unsigned int limit = ROBUST_LIST_LIMIT;
+ compat_long_t futex_offset;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (get_user(uentry, &head->list.next))
+ return;
+ entry = compat_ptr(uentry);
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (get_user(upending, &head->list_op_pending))
+ return;
+ pending = compat_ptr(upending);
+ if (upending)
+ handle_futex_death((void *)pending + futex_offset, curr);
+
+ while (compat_ptr(uentry) != &head->list) {
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending)
+ if (handle_futex_death((void *)entry + futex_offset,
+ curr))
+ return;
+
+ /*
+ * Fetch the next entry in the list:
+ */
+ if (get_user(uentry, (compat_uptr_t *)&entry->next))
+ return;
+ entry = compat_ptr(uentry);
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+}
+
+asmlinkage long
+compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
+ compat_size_t len)
+{
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+asmlinkage long
+compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
+ compat_size_t __user *len_ptr)
+{
+ struct compat_robust_list_head *head;
+ unsigned long ret;
+
+ if (!pid)
+ head = current->compat_robust_list;
+ else {
+ struct task_struct *p;
+
+ ret = -ESRCH;
+ read_lock(&tasklist_lock);
+ p = find_task_by_pid(pid);
+ if (!p)
+ goto err_unlock;
+ ret = -EPERM;
+ if ((current->euid != p->euid) && (current->euid != p->uid) &&
+ !capable(CAP_SYS_PTRACE))
+ goto err_unlock;
+ head = p->compat_robust_list;
+ read_unlock(&tasklist_lock);
+ }
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
+ struct compat_timespec __user *utime, u32 __user *uaddr2,
+ u32 val3)
+{
+ struct timespec t;
+ unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ int val2 = 0;
+
+ if ((op == FUTEX_WAIT) && utime) {
+ if (get_compat_timespec(&t, utime))
+ return -EFAULT;
+ timeout = timespec_to_jiffies(&t) + 1;
+ }
+ if (op >= FUTEX_REQUEUE)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex((unsigned long)uaddr, op, val, timeout,
+ (unsigned long)uaddr2, val2, val3);
+}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 14bc9cfa6399..0237a556eb1f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -123,6 +123,26 @@ void ktime_get_ts(struct timespec *ts)
EXPORT_SYMBOL_GPL(ktime_get_ts);
/*
+ * Get the coarse grained time at the softirq based on xtime and
+ * wall_to_monotonic.
+ */
+static void hrtimer_get_softirq_time(struct hrtimer_base *base)
+{
+ ktime_t xtim, tomono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ xtim = timespec_to_ktime(xtime);
+ tomono = timespec_to_ktime(wall_to_monotonic);
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ base[CLOCK_REALTIME].softirq_time = xtim;
+ base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
+}
+
+/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
*/
@@ -246,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
/*
* Divide a ktime value by a nanosecond value
*/
-static unsigned long ktime_divns(const ktime_t kt, nsec_t div)
+static unsigned long ktime_divns(const ktime_t kt, s64 div)
{
u64 dclc, inc, dns;
int sft = 0;
@@ -281,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
* hrtimer_forward - forward the timer expiry
*
* @timer: hrtimer to forward
+ * @now: forward past this time
* @interval: the interval to forward
*
* Forward the timer expiry so it will expire in the future.
* Returns the number of overruns.
*/
unsigned long
-hrtimer_forward(struct hrtimer *timer, ktime_t interval)
+hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
unsigned long orun = 1;
- ktime_t delta, now;
-
- now = timer->base->get_time();
+ ktime_t delta;
delta = ktime_sub(now, timer->expires);
@@ -303,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t interval)
interval.tv64 = timer->base->resolution.tv64;
if (unlikely(delta.tv64 >= interval.tv64)) {
- nsec_t incr = ktime_to_ns(interval);
+ s64 incr = ktime_to_ns(interval);
orun = ktime_divns(delta, incr);
timer->expires = ktime_add_ns(timer->expires, incr * orun);
@@ -355,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
rb_link_node(&timer->node, parent, link);
rb_insert_color(&timer->node, &base->active);
- timer->state = HRTIMER_PENDING;
-
if (!base->first || timer->expires.tv64 <
rb_entry(base->first, struct hrtimer, node)->expires.tv64)
base->first = &timer->node;
@@ -376,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
if (base->first == &timer->node)
base->first = rb_next(&timer->node);
rb_erase(&timer->node, &base->active);
+ timer->node.rb_parent = HRTIMER_INACTIVE;
}
/*
@@ -386,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
{
if (hrtimer_active(timer)) {
__remove_hrtimer(timer, base);
- timer->state = HRTIMER_INACTIVE;
return 1;
}
return 0;
@@ -560,6 +577,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
clock_id = CLOCK_MONOTONIC;
timer->base = &bases[clock_id];
+ timer->node.rb_parent = HRTIMER_INACTIVE;
}
/**
@@ -586,48 +604,35 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
*/
static inline void run_hrtimer_queue(struct hrtimer_base *base)
{
- ktime_t now = base->get_time();
struct rb_node *node;
+ if (base->get_softirq_time)
+ base->softirq_time = base->get_softirq_time();
+
spin_lock_irq(&base->lock);
while ((node = base->first)) {
struct hrtimer *timer;
- int (*fn)(void *);
+ int (*fn)(struct hrtimer *);
int restart;
- void *data;
timer = rb_entry(node, struct hrtimer, node);
- if (now.tv64 <= timer->expires.tv64)
+ if (base->softirq_time.tv64 <= timer->expires.tv64)
break;
fn = timer->function;
- data = timer->data;
set_curr_timer(base, timer);
- timer->state = HRTIMER_RUNNING;
__remove_hrtimer(timer, base);
spin_unlock_irq(&base->lock);
- /*
- * fn == NULL is special case for the simplest timer
- * variant - wake up process and do not restart:
- */
- if (!fn) {
- wake_up_process(data);
- restart = HRTIMER_NORESTART;
- } else
- restart = fn(data);
+ restart = fn(timer);
spin_lock_irq(&base->lock);
- /* Another CPU has added back the timer */
- if (timer->state != HRTIMER_RUNNING)
- continue;
-
- if (restart == HRTIMER_RESTART)
+ if (restart != HRTIMER_NORESTART) {
+ BUG_ON(hrtimer_active(timer));
enqueue_hrtimer(timer, base);
- else
- timer->state = HRTIMER_EXPIRED;
+ }
}
set_curr_timer(base, NULL);
spin_unlock_irq(&base->lock);
@@ -641,6 +646,8 @@ void hrtimer_run_queues(void)
struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
int i;
+ hrtimer_get_softirq_time(base);
+
for (i = 0; i < MAX_HRTIMER_BASES; i++)
run_hrtimer_queue(&base[i]);
}
@@ -649,79 +656,70 @@ void hrtimer_run_queues(void)
* Sleep related functions:
*/
-/**
- * schedule_hrtimer - sleep until timeout
- *
- * @timer: hrtimer variable initialized with the correct clock base
- * @mode: timeout value is abs/rel
- *
- * Make the current task sleep until @timeout is
- * elapsed.
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
- * pass before the routine returns. The routine will return 0
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task. In this case the remaining time
- * will be returned
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- */
-static ktime_t __sched
-schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
-{
- /* fn stays NULL, meaning single-shot wakeup: */
- timer->data = current;
+struct sleep_hrtimer {
+ struct hrtimer timer;
+ struct task_struct *task;
+ int expired;
+};
- hrtimer_start(timer, timer->expires, mode);
+static int nanosleep_wakeup(struct hrtimer *timer)
+{
+ struct sleep_hrtimer *t =
+ container_of(timer, struct sleep_hrtimer, timer);
- schedule();
- hrtimer_cancel(timer);
+ t->expired = 1;
+ wake_up_process(t->task);
- /* Return the remaining time: */
- if (timer->state != HRTIMER_EXPIRED)
- return ktime_sub(timer->expires, timer->base->get_time());
- else
- return (ktime_t) {.tv64 = 0 };
+ return HRTIMER_NORESTART;
}
-static inline ktime_t __sched
-schedule_hrtimer_interruptible(struct hrtimer *timer,
- const enum hrtimer_mode mode)
+static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
{
- set_current_state(TASK_INTERRUPTIBLE);
+ t->timer.function = nanosleep_wakeup;
+ t->task = current;
+ t->expired = 0;
+
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_start(&t->timer, t->timer.expires, mode);
+
+ schedule();
+
+ if (unlikely(!t->expired)) {
+ hrtimer_cancel(&t->timer);
+ mode = HRTIMER_ABS;
+ }
+ } while (!t->expired && !signal_pending(current));
- return schedule_hrtimer(timer, mode);
+ return t->expired;
}
static long __sched nanosleep_restart(struct restart_block *restart)
{
+ struct sleep_hrtimer t;
struct timespec __user *rmtp;
struct timespec tu;
- void *rfn_save = restart->fn;
- struct hrtimer timer;
- ktime_t rem;
+ ktime_t time;
restart->fn = do_no_restart_syscall;
- hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS);
-
- timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
-
- rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
+ hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
+ t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
- if (rem.tv64 <= 0)
+ if (do_nanosleep(&t, HRTIMER_ABS))
return 0;
rmtp = (struct timespec __user *) restart->arg2;
- tu = ktime_to_timespec(rem);
- if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
- return -EFAULT;
+ if (rmtp) {
+ time = ktime_sub(t.timer.expires, t.timer.base->get_time());
+ if (time.tv64 <= 0)
+ return 0;
+ tu = ktime_to_timespec(time);
+ if (copy_to_user(rmtp, &tu, sizeof(tu)))
+ return -EFAULT;
+ }
- restart->fn = rfn_save;
+ restart->fn = nanosleep_restart;
/* The other values in restart are already filled in */
return -ERESTART_RESTARTBLOCK;
@@ -731,33 +729,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
- struct hrtimer timer;
+ struct sleep_hrtimer t;
struct timespec tu;
ktime_t rem;
- hrtimer_init(&timer, clockid, mode);
-
- timer.expires = timespec_to_ktime(*rqtp);
-
- rem = schedule_hrtimer_interruptible(&timer, mode);
- if (rem.tv64 <= 0)
+ hrtimer_init(&t.timer, clockid, mode);
+ t.timer.expires = timespec_to_ktime(*rqtp);
+ if (do_nanosleep(&t, mode))
return 0;
/* Absolute timers do not update the rmtp value and restart: */
if (mode == HRTIMER_ABS)
return -ERESTARTNOHAND;
- tu = ktime_to_timespec(rem);
-
- if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
- return -EFAULT;
+ if (rmtp) {
+ rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
+ if (rem.tv64 <= 0)
+ return 0;
+ tu = ktime_to_timespec(rem);
+ if (copy_to_user(rmtp, &tu, sizeof(tu)))
+ return -EFAULT;
+ }
restart = &current_thread_info()->restart_block;
restart->fn = nanosleep_restart;
- restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
- restart->arg1 = timer.expires.tv64 >> 32;
+ restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
+ restart->arg1 = t.timer.expires.tv64 >> 32;
restart->arg2 = (unsigned long) rmtp;
- restart->arg3 = (unsigned long) timer.base->index;
+ restart->arg3 = (unsigned long) t.timer.base->index;
return -ERESTART_RESTARTBLOCK;
}
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 49378738ff5e..2b33f852be3e 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,4 @@
-obj-y := handle.o manage.o spurious.o
+obj-y := handle.o manage.o spurious.o migration.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o
-
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 97d5559997d2..ac766ad573e8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -204,10 +204,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
p = &desc->action;
if ((old = *p) != NULL) {
/* Can't share interrupts unless both agree to */
- if (!(old->flags & new->flags & SA_SHIRQ)) {
- spin_unlock_irqrestore(&desc->lock,flags);
- return -EBUSY;
- }
+ if (!(old->flags & new->flags & SA_SHIRQ))
+ goto mismatch;
+
+#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
+ /* All handlers must agree on per-cpuness */
+ if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
+ goto mismatch;
+#endif
/* add new interrupt at end of irq queue */
do {
@@ -218,7 +222,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
}
*p = new;
-
+#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
+ if (new->flags & SA_PERCPU_IRQ)
+ desc->status |= IRQ_PER_CPU;
+#endif
if (!shared) {
desc->depth = 0;
desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
@@ -236,6 +243,12 @@ int setup_irq(unsigned int irq, struct irqaction * new)
register_handler_proc(irq, new);
return 0;
+
+mismatch:
+ spin_unlock_irqrestore(&desc->lock, flags);
+ printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
+ dump_stack();
+ return -EBUSY;
}
/**
@@ -258,6 +271,7 @@ void free_irq(unsigned int irq, void *dev_id)
struct irqaction **p;
unsigned long flags;
+ WARN_ON(in_interrupt());
if (irq >= NR_IRQS)
return;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
new file mode 100644
index 000000000000..52a8655fa080
--- /dev/null
+++ b/kernel/irq/migration.c
@@ -0,0 +1,65 @@
+#include <linux/irq.h>
+
+#if defined(CONFIG_GENERIC_PENDING_IRQ)
+
+void set_pending_irq(unsigned int irq, cpumask_t mask)
+{
+ irq_desc_t *desc = irq_desc + irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->move_irq = 1;
+ pending_irq_cpumask[irq] = mask;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void move_native_irq(int irq)
+{
+ cpumask_t tmp;
+ irq_desc_t *desc = irq_descp(irq);
+
+ if (likely(!desc->move_irq))
+ return;
+
+ /*
+ * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
+ */
+ if (CHECK_IRQ_PER_CPU(desc->status)) {
+ WARN_ON(1);
+ return;
+ }
+
+ desc->move_irq = 0;
+
+ if (likely(cpus_empty(pending_irq_cpumask[irq])))
+ return;
+
+ if (!desc->handler->set_affinity)
+ return;
+
+ assert_spin_locked(&desc->lock);
+
+ cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+
+ /*
+ * If there was a valid mask to work with, please
+ * do the disable, re-program, enable sequence.
+ * This is *not* particularly important for level triggered
+ * but in a edge trigger case, we might be setting rte
+ * when an active trigger is comming in. This could
+ * cause some ioapics to mal-function.
+ * Being paranoid i guess!
+ */
+ if (unlikely(!cpus_empty(tmp))) {
+ if (likely(!(desc->status & IRQ_DISABLED)))
+ desc->handler->disable(irq);
+
+ desc->handler->set_affinity(irq,tmp);
+
+ if (likely(!(desc->status & IRQ_DISABLED)))
+ desc->handler->enable(irq);
+ }
+ cpus_clear(pending_irq_cpumask[irq]);
+}
+
+#endif
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 379be2f8c84c..204ed7939e75 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,21 +128,75 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
/*
* The timer is automagically restarted, when interval != 0
*/
-int it_real_fn(void *data)
+int it_real_fn(struct hrtimer *timer)
{
- struct task_struct *tsk = (struct task_struct *) data;
+ struct signal_struct *sig =
+ container_of(timer, struct signal_struct, real_timer);
- send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk);
-
- if (tsk->signal->it_real_incr.tv64 != 0) {
- hrtimer_forward(&tsk->signal->real_timer,
- tsk->signal->it_real_incr);
+ send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
+ if (sig->it_real_incr.tv64 != 0) {
+ hrtimer_forward(timer, timer->base->softirq_time,
+ sig->it_real_incr);
return HRTIMER_RESTART;
}
return HRTIMER_NORESTART;
}
+/*
+ * We do not care about correctness. We just sanitize the values so
+ * the ktime_t operations which expect normalized values do not
+ * break. This converts negative values to long timeouts similar to
+ * the code in kernel versions < 2.6.16
+ *
+ * Print a limited number of warning messages when an invalid timeval
+ * is detected.
+ */
+static void fixup_timeval(struct timeval *tv, int interval)
+{
+ static int warnlimit = 10;
+ unsigned long tmp;
+
+ if (warnlimit > 0) {
+ warnlimit--;
+ printk(KERN_WARNING
+ "setitimer: %s (pid = %d) provided "
+ "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
+ current->comm, current->pid,
+ interval ? "it_interval" : "it_value",
+ tv->tv_sec, (long) tv->tv_usec);
+ }
+
+ tmp = tv->tv_usec;
+ if (tmp >= USEC_PER_SEC) {
+ tv->tv_usec = tmp % USEC_PER_SEC;
+ tv->tv_sec += tmp / USEC_PER_SEC;
+ }
+
+ tmp = tv->tv_sec;
+ if (tmp > LONG_MAX)
+ tv->tv_sec = LONG_MAX;
+}
+
+/*
+ * Returns true if the timeval is in canonical form
+ */
+#define timeval_valid(t) \
+ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
+
+/*
+ * Check for invalid timevals, sanitize them and print a limited
+ * number of warnings.
+ */
+static void check_itimerval(struct itimerval *value) {
+
+ if (unlikely(!timeval_valid(&value->it_value)))
+ fixup_timeval(&value->it_value, 0);
+
+ if (unlikely(!timeval_valid(&value->it_interval)))
+ fixup_timeval(&value->it_interval, 1);
+}
+
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
{
struct task_struct *tsk = current;
@@ -150,6 +204,18 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
ktime_t expires;
cputime_t cval, cinterval, nval, ninterval;
+ /*
+ * Validate the timevals in value.
+ *
+ * Note: Although the spec requires that invalid values shall
+ * return -EINVAL, we just fixup the value and print a limited
+ * number of warnings in order not to break users of this
+ * historical misfeature.
+ *
+ * Scheduled for replacement in March 2007
+ */
+ check_itimerval(value);
+
switch (which) {
case ITIMER_REAL:
again:
@@ -226,6 +292,43 @@ again:
return 0;
}
+/**
+ * alarm_setitimer - set alarm in seconds
+ *
+ * @seconds: number of seconds until alarm
+ * 0 disables the alarm
+ *
+ * Returns the remaining time in seconds of a pending timer or 0 when
+ * the timer is not active.
+ *
+ * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
+ * negative timeval settings which would cause immediate expiry.
+ */
+unsigned int alarm_setitimer(unsigned int seconds)
+{
+ struct itimerval it_new, it_old;
+
+#if BITS_PER_LONG < 64
+ if (seconds > INT_MAX)
+ seconds = INT_MAX;
+#endif
+ it_new.it_value.tv_sec = seconds;
+ it_new.it_value.tv_usec = 0;
+ it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+
+ do_setitimer(ITIMER_REAL, &it_new, &it_old);
+
+ /*
+ * We can't return 0 if we have an alarm pending ... And we'd
+ * better return too much than too little anyway
+ */
+ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
+ it_old.it_value.tv_usec >= 500000)
+ it_old.it_value.tv_sec++;
+
+ return it_old.it_value.tv_sec;
+}
+
asmlinkage long sys_setitimer(int which,
struct itimerval __user *value,
struct itimerval __user *ovalue)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 51a892063aaa..20a997c73c3d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data)
sa.sa.sa_handler = SIG_IGN;
sa.sa.sa_flags = 0;
siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
- do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+ do_sigaction(SIGCHLD, &sa, NULL);
allow_signal(SIGCHLD);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73ce..1156eb0977d0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
-DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
+DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
}
/*
- * This function is called from exit_thread or flush_thread when task tk's
- * stack is being recycled so that we can recycle any function-return probe
- * instances associated with this task. These left over instances represent
- * probed functions that have been called but will never return.
+ * This function is called from finish_task_switch when task tk becomes dead,
+ * so that we can recycle any function-return probe instances associated
+ * with this task. These left over instances represent probed functions
+ * that have been called but will never return.
*/
void __kprobes kprobe_flush_task(struct task_struct *tk)
{
@@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
unsigned long flags = 0;
spin_lock_irqsave(&kretprobe_lock, flags);
- head = kretprobe_inst_table_head(current);
+ head = kretprobe_inst_table_head(tk);
hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
if (ri->task == tk)
recycle_rp_inst(ri);
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
}
p->nmissed = 0;
- down(&kprobe_mutex);
+ mutex_lock(&kprobe_mutex);
old_p = get_kprobe(p->addr);
if (old_p) {
ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
arch_arm_kprobe(p);
out:
- up(&kprobe_mutex);
+ mutex_unlock(&kprobe_mutex);
if (ret && probed_mod)
module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
struct kprobe *old_p, *list_p;
int cleanup_p;
- down(&kprobe_mutex);
+ mutex_lock(&kprobe_mutex);
old_p = get_kprobe(p->addr);
if (unlikely(!old_p)) {
- up(&kprobe_mutex);
+ mutex_unlock(&kprobe_mutex);
return;
}
if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid_p;
- up(&kprobe_mutex);
+ mutex_unlock(&kprobe_mutex);
return;
}
valid_p:
@@ -523,7 +523,7 @@ valid_p:
cleanup_p = 0;
}
- up(&kprobe_mutex);
+ mutex_unlock(&kprobe_mutex);
synchronize_sched();
if (p->mod_refcounted &&
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d5eeae0fa5bc..f119e098e67b 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,9 +15,6 @@
#include <linux/module.h>
#include <linux/init.h>
-u64 uevent_seqnum;
-char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
-
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -25,7 +22,7 @@ static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
static struct subsys_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
-#ifdef CONFIG_HOTPLUG
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
{
@@ -55,7 +52,7 @@ decl_subsys(kernel, NULL, NULL);
EXPORT_SYMBOL_GPL(kernel_subsys);
static struct attribute * kernel_attrs[] = {
-#ifdef CONFIG_HOTPLUG
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
&uevent_seqnum_attr.attr,
&uevent_helper_attr.attr,
#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a1092c..c5f3c6613b6d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,6 +12,7 @@
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <asm/semaphore.h>
/*
@@ -41,7 +42,7 @@ struct kthread_stop_info
/* Thread stopping is done by setthing this var: lock serializes
* multiple kthread_stop calls. */
-static DECLARE_MUTEX(kthread_stop_lock);
+static DEFINE_MUTEX(kthread_stop_lock);
static struct kthread_stop_info kthread_stop_info;
int kthread_should_stop(void)
@@ -114,7 +115,9 @@ static void keventd_create_kthread(void *_create)
create->result = ERR_PTR(pid);
} else {
wait_for_completion(&create->started);
+ read_lock(&tasklist_lock);
create->result = find_task_by_pid(pid);
+ read_unlock(&tasklist_lock);
}
complete(&create->done);
}
@@ -173,7 +176,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
{
int ret;
- down(&kthread_stop_lock);
+ mutex_lock(&kthread_stop_lock);
/* It could exit after stop_info.k set, but before wake_up_process. */
get_task_struct(k);
@@ -194,7 +197,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
wait_for_completion(&kthread_stop_info.done);
kthread_stop_info.k = NULL;
ret = kthread_stop_info.err;
- up(&kthread_stop_lock);
+ mutex_unlock(&kthread_stop_lock);
return ret;
}
diff --git a/kernel/module.c b/kernel/module.c
index 5aad477ddc79..bd088a7c1499 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -39,6 +39,7 @@
#include <linux/device.h>
#include <linux/string.h>
#include <linux/sched.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
@@ -60,29 +61,20 @@
static DEFINE_SPINLOCK(modlist_lock);
/* List of modules, protected by module_mutex AND modlist_lock */
-static DECLARE_MUTEX(module_mutex);
+static DEFINE_MUTEX(module_mutex);
static LIST_HEAD(modules);
-static DECLARE_MUTEX(notify_mutex);
-static struct notifier_block * module_notify_list;
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
int register_module_notifier(struct notifier_block * nb)
{
- int err;
- down(&notify_mutex);
- err = notifier_chain_register(&module_notify_list, nb);
- up(&notify_mutex);
- return err;
+ return blocking_notifier_chain_register(&module_notify_list, nb);
}
EXPORT_SYMBOL(register_module_notifier);
int unregister_module_notifier(struct notifier_block * nb)
{
- int err;
- down(&notify_mutex);
- err = notifier_chain_unregister(&module_notify_list, nb);
- up(&notify_mutex);
- return err;
+ return blocking_notifier_chain_unregister(&module_notify_list, nb);
}
EXPORT_SYMBOL(unregister_module_notifier);
@@ -126,15 +118,30 @@ extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
extern const unsigned long __start___kcrctab[];
extern const unsigned long __start___kcrctab_gpl[];
+extern const unsigned long __start___kcrctab_gpl_future[];
#ifndef CONFIG_MODVERSIONS
#define symversion(base, idx) NULL
#else
-#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
+#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
#endif
+/* lookup symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
+{
+ const struct kernel_symbol *ks = start;
+ for (; ks < stop; ks++)
+ if (strcmp(ks->name, name) == 0)
+ return ks;
+ return NULL;
+}
+
/* Find a symbol, return value, crc and module which owns it */
static unsigned long __find_symbol(const char *name,
struct module **owner,
@@ -142,64 +149,81 @@ static unsigned long __find_symbol(const char *name,
int gplok)
{
struct module *mod;
- unsigned int i;
+ const struct kernel_symbol *ks;
/* Core kernel first. */
*owner = NULL;
- for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) {
- if (strcmp(__start___ksymtab[i].name, name) == 0) {
- *crc = symversion(__start___kcrctab, i);
- return __start___ksymtab[i].value;
- }
+ ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
+ if (ks) {
+ *crc = symversion(__start___kcrctab, (ks - __start___ksymtab));
+ return ks->value;
}
if (gplok) {
- for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++)
- if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) {
- *crc = symversion(__start___kcrctab_gpl, i);
- return __start___ksymtab_gpl[i].value;
- }
+ ks = lookup_symbol(name, __start___ksymtab_gpl,
+ __stop___ksymtab_gpl);
+ if (ks) {
+ *crc = symversion(__start___kcrctab_gpl,
+ (ks - __start___ksymtab_gpl));
+ return ks->value;
+ }
+ }
+ ks = lookup_symbol(name, __start___ksymtab_gpl_future,
+ __stop___ksymtab_gpl_future);
+ if (ks) {
+ if (!gplok) {
+ printk(KERN_WARNING "Symbol %s is being used "
+ "by a non-GPL module, which will not "
+ "be allowed in the future\n", name);
+ printk(KERN_WARNING "Please see the file "
+ "Documentation/feature-removal-schedule.txt "
+ "in the kernel source tree for more "
+ "details.\n");
+ }
+ *crc = symversion(__start___kcrctab_gpl_future,
+ (ks - __start___ksymtab_gpl_future));
+ return ks->value;
}
/* Now try modules. */
list_for_each_entry(mod, &modules, list) {
*owner = mod;
- for (i = 0; i < mod->num_syms; i++)
- if (strcmp(mod->syms[i].name, name) == 0) {
- *crc = symversion(mod->crcs, i);
- return mod->syms[i].value;
- }
+ ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+ if (ks) {
+ *crc = symversion(mod->crcs, (ks - mod->syms));
+ return ks->value;
+ }
if (gplok) {
- for (i = 0; i < mod->num_gpl_syms; i++) {
- if (strcmp(mod->gpl_syms[i].name, name) == 0) {
- *crc = symversion(mod->gpl_crcs, i);
- return mod->gpl_syms[i].value;
- }
+ ks = lookup_symbol(name, mod->gpl_syms,
+ mod->gpl_syms + mod->num_gpl_syms);
+ if (ks) {
+ *crc = symversion(mod->gpl_crcs,
+ (ks - mod->gpl_syms));
+ return ks->value;
+ }
+ }
+ ks = lookup_symbol(name, mod->gpl_future_syms,
+ (mod->gpl_future_syms +
+ mod->num_gpl_future_syms));
+ if (ks) {
+ if (!gplok) {
+ printk(KERN_WARNING "Symbol %s is being used "
+ "by a non-GPL module, which will not "
+ "be allowed in the future\n", name);
+ printk(KERN_WARNING "Please see the file "
+ "Documentation/feature-removal-schedule.txt "
+ "in the kernel source tree for more "
+ "details.\n");
}
+ *crc = symversion(mod->gpl_future_crcs,
+ (ks - mod->gpl_future_syms));
+ return ks->value;
}
}
DEBUGP("Failed to find symbol %s\n", name);
return 0;
}
-/* Find a symbol in this elf symbol table */
-static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
- unsigned int symindex,
- const char *strtab,
- const char *name)
-{
- unsigned int i;
- Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
-
- /* Search (defined) internal symbols first. */
- for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
- if (sym[i].st_shndx != SHN_UNDEF
- && strcmp(name, strtab + sym[i].st_name) == 0)
- return sym[i].st_value;
- }
- return 0;
-}
-
/* Search for module by name: must hold module_mutex. */
static struct module *find_module(const char *name)
{
@@ -379,7 +403,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_MODULE_UNLOAD
#define MODINFO_ATTR(field) \
static void setup_modinfo_##field(struct module *mod, const char *s) \
{ \
@@ -411,12 +434,7 @@ static struct module_attribute modinfo_##field = { \
MODINFO_ATTR(version);
MODINFO_ATTR(srcversion);
-static struct module_attribute *modinfo_attrs[] = {
- &modinfo_version,
- &modinfo_srcversion,
- NULL,
-};
-
+#ifdef CONFIG_MODULE_UNLOAD
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
@@ -557,7 +575,7 @@ static void free_module(struct module *mod);
static void wait_for_zero_refcount(struct module *mod)
{
/* Since we might sleep for some time, drop the semaphore first */
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
for (;;) {
DEBUGP("Looking at refcount...\n");
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -566,7 +584,7 @@ static void wait_for_zero_refcount(struct module *mod)
schedule();
}
current->state = TASK_RUNNING;
- down(&module_mutex);
+ mutex_lock(&module_mutex);
}
asmlinkage long
@@ -583,7 +601,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
return -EFAULT;
name[MODULE_NAME_LEN-1] = '\0';
- if (down_interruptible(&module_mutex) != 0)
+ if (mutex_lock_interruptible(&module_mutex) != 0)
return -EINTR;
mod = find_module(name);
@@ -632,14 +650,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
/* Final destruction now noone is using it. */
if (mod->exit != NULL) {
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
mod->exit();
- down(&module_mutex);
+ mutex_lock(&module_mutex);
}
free_module(mod);
out:
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
return ret;
}
@@ -731,138 +749,14 @@ static inline void module_unload_init(struct module *mod)
}
#endif /* CONFIG_MODULE_UNLOAD */
-#ifdef CONFIG_OBSOLETE_MODPARM
-/* Bounds checking done below */
-static int obsparm_copy_string(const char *val, struct kernel_param *kp)
-{
- strcpy(kp->arg, val);
- return 0;
-}
-
-static int set_obsolete(const char *val, struct kernel_param *kp)
-{
- unsigned int min, max;
- unsigned int size, maxsize;
- int dummy;
- char *endp;
- const char *p;
- struct obsolete_modparm *obsparm = kp->arg;
-
- if (!val) {
- printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
- return -EINVAL;
- }
-
- /* type is: [min[-max]]{b,h,i,l,s} */
- p = obsparm->type;
- min = simple_strtol(p, &endp, 10);
- if (endp == obsparm->type)
- min = max = 1;
- else if (*endp == '-') {
- p = endp+1;
- max = simple_strtol(p, &endp, 10);
- } else
- max = min;
- switch (*endp) {
- case 'b':
- return param_array(kp->name, val, min, max, obsparm->addr,
- 1, param_set_byte, &dummy);
- case 'h':
- return param_array(kp->name, val, min, max, obsparm->addr,
- sizeof(short), param_set_short, &dummy);
- case 'i':
- return param_array(kp->name, val, min, max, obsparm->addr,
- sizeof(int), param_set_int, &dummy);
- case 'l':
- return param_array(kp->name, val, min, max, obsparm->addr,
- sizeof(long), param_set_long, &dummy);
- case 's':
- return param_array(kp->name, val, min, max, obsparm->addr,
- sizeof(char *), param_set_charp, &dummy);
-
- case 'c':
- /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
- and the decl is "char xxx[5][50];" */
- p = endp+1;
- maxsize = simple_strtol(p, &endp, 10);
- /* We check lengths here (yes, this is a hack). */
- p = val;
- while (p[size = strcspn(p, ",")]) {
- if (size >= maxsize)
- goto oversize;
- p += size+1;
- }
- if (size >= maxsize)
- goto oversize;
- return param_array(kp->name, val, min, max, obsparm->addr,
- maxsize, obsparm_copy_string, &dummy);
- }
- printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
- return -EINVAL;
- oversize:
- printk(KERN_ERR
- "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
- return -EINVAL;
-}
-
-static int obsolete_params(const char *name,
- char *args,
- struct obsolete_modparm obsparm[],
- unsigned int num,
- Elf_Shdr *sechdrs,
- unsigned int symindex,
- const char *strtab)
-{
- struct kernel_param *kp;
- unsigned int i;
- int ret;
-
- kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
- if (!kp)
- return -ENOMEM;
-
- for (i = 0; i < num; i++) {
- char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
-
- snprintf(sym_name, sizeof(sym_name), "%s%s",
- MODULE_SYMBOL_PREFIX, obsparm[i].name);
-
- kp[i].name = obsparm[i].name;
- kp[i].perm = 000;
- kp[i].set = set_obsolete;
- kp[i].get = NULL;
- obsparm[i].addr
- = (void *)find_local_symbol(sechdrs, symindex, strtab,
- sym_name);
- if (!obsparm[i].addr) {
- printk("%s: falsely claims to have parameter %s\n",
- name, obsparm[i].name);
- ret = -EINVAL;
- goto out;
- }
- kp[i].arg = &obsparm[i];
- }
-
- ret = parse_args(name, args, kp, num, NULL);
- out:
- kfree(kp);
- return ret;
-}
-#else
-static int obsolete_params(const char *name,
- char *args,
- struct obsolete_modparm obsparm[],
- unsigned int num,
- Elf_Shdr *sechdrs,
- unsigned int symindex,
- const char *strtab)
-{
- if (num != 0)
- printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
- name);
- return 0;
-}
-#endif /* CONFIG_OBSOLETE_MODPARM */
+static struct module_attribute *modinfo_attrs[] = {
+ &modinfo_version,
+ &modinfo_srcversion,
+#ifdef CONFIG_MODULE_UNLOAD
+ &refcnt,
+#endif
+ NULL,
+};
static const char vermagic[] = VERMAGIC_STRING;
@@ -1056,37 +950,28 @@ static inline void remove_sect_attrs(struct module *mod)
}
#endif /* CONFIG_KALLSYMS */
-
-#ifdef CONFIG_MODULE_UNLOAD
-static inline int module_add_refcnt_attr(struct module *mod)
-{
- return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
-}
-static void module_remove_refcnt_attr(struct module *mod)
-{
- return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
-}
-#else
-static inline int module_add_refcnt_attr(struct module *mod)
-{
- return 0;
-}
-static void module_remove_refcnt_attr(struct module *mod)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_UNLOAD
static int module_add_modinfo_attrs(struct module *mod)
{
struct module_attribute *attr;
+ struct module_attribute *temp_attr;
int error = 0;
int i;
+ mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ (ARRAY_SIZE(modinfo_attrs) + 1)),
+ GFP_KERNEL);
+ if (!mod->modinfo_attrs)
+ return -ENOMEM;
+
+ temp_attr = mod->modinfo_attrs;
for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
if (!attr->test ||
- (attr->test && attr->test(mod)))
- error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
+ (attr->test && attr->test(mod))) {
+ memcpy(temp_attr, attr, sizeof(*temp_attr));
+ temp_attr->attr.owner = mod;
+ error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+ ++temp_attr;
+ }
}
return error;
}
@@ -1096,12 +981,16 @@ static void module_remove_modinfo_attrs(struct module *mod)
struct module_attribute *attr;
int i;
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ /* pick a field to test for end of list */
+ if (!attr->attr.name)
+ break;
sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
- attr->free(mod);
+ if (attr->free)
+ attr->free(mod);
}
+ kfree(mod->modinfo_attrs);
}
-#endif
static int mod_sysfs_setup(struct module *mod,
struct kernel_param *kparam,
@@ -1119,19 +1008,13 @@ static int mod_sysfs_setup(struct module *mod,
if (err)
goto out;
- err = module_add_refcnt_attr(mod);
- if (err)
- goto out_unreg;
-
err = module_param_sysfs_setup(mod, kparam, num_params);
if (err)
goto out_unreg;
-#ifdef CONFIG_MODULE_UNLOAD
err = module_add_modinfo_attrs(mod);
if (err)
goto out_unreg;
-#endif
return 0;
@@ -1143,10 +1026,7 @@ out:
static void mod_kobject_remove(struct module *mod)
{
-#ifdef CONFIG_MODULE_UNLOAD
module_remove_modinfo_attrs(mod);
-#endif
- module_remove_refcnt_attr(mod);
module_param_sysfs_remove(mod);
kobject_unregister(&mod->mkobj.kobj);
@@ -1424,7 +1304,6 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
return NULL;
}
-#ifdef CONFIG_MODULE_UNLOAD
static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
unsigned int infoindex)
{
@@ -1439,23 +1318,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
attr->attr.name));
}
}
-#endif
#ifdef CONFIG_KALLSYMS
int is_exported(const char *name, const struct module *mod)
{
- unsigned int i;
-
- if (!mod) {
- for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++)
- if (strcmp(__start___ksymtab[i].name, name) == 0)
- return 1;
- return 0;
- }
- for (i = 0; i < mod->num_syms; i++)
- if (strcmp(mod->syms[i].name, name) == 0)
+ if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
+ return 1;
+ else
+ if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
return 1;
- return 0;
+ else
+ return 0;
}
/* As per nm */
@@ -1537,8 +1410,8 @@ static struct module *load_module(void __user *umod,
char *secstrings, *args, *modmagic, *strtab = NULL;
unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
exportindex, modindex, obsparmindex, infoindex, gplindex,
- crcindex, gplcrcindex, versindex, pcpuindex;
- long arglen;
+ crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
+ gplfuturecrcindex;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1618,8 +1491,10 @@ static struct module *load_module(void __user *umod,
/* Optional sections */
exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
+ gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+ gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1655,23 +1530,11 @@ static struct module *load_module(void __user *umod,
}
/* Now copy in args */
- arglen = strlen_user(uargs);
- if (!arglen) {
- err = -EFAULT;
+ args = strndup_user(uargs, ~0UL >> 1);
+ if (IS_ERR(args)) {
+ err = PTR_ERR(args);
goto free_hdr;
}
- args = kmalloc(arglen, GFP_KERNEL);
- if (!args) {
- err = -ENOMEM;
- goto free_hdr;
- }
- if (copy_from_user(args, uargs, arglen) != 0) {
- err = -EFAULT;
- goto free_mod;
- }
-
- /* Userspace could have altered the string after the strlen_user() */
- args[arglen - 1] = '\0';
if (find_module(mod->name)) {
err = -EEXIST;
@@ -1755,10 +1618,8 @@ static struct module *load_module(void __user *umod,
if (strcmp(mod->name, "driverloader") == 0)
add_taint(TAINT_PROPRIETARY_MODULE);
-#ifdef CONFIG_MODULE_UNLOAD
/* Set up MODINFO_ATTR fields */
setup_modinfo(mod, sechdrs, infoindex);
-#endif
/* Fix up syms, so that st_value is a pointer to location. */
err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
@@ -1775,10 +1636,16 @@ static struct module *load_module(void __user *umod,
mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
if (gplcrcindex)
mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
+ mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
+ sizeof(*mod->gpl_future_syms);
+ mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
+ if (gplfuturecrcindex)
+ mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !crcindex) ||
- (mod->num_gpl_syms && !gplcrcindex)) {
+ (mod->num_gpl_syms && !gplcrcindex) ||
+ (mod->num_gpl_future_syms && !gplfuturecrcindex)) {
printk(KERN_WARNING "%s: No versions for exported symbols."
" Tainting kernel.\n", mod->name);
add_taint(TAINT_FORCED_MODULE);
@@ -1847,27 +1714,17 @@ static struct module *load_module(void __user *umod,
set_fs(old_fs);
mod->args = args;
- if (obsparmindex) {
- err = obsolete_params(mod->name, mod->args,
- (struct obsolete_modparm *)
- sechdrs[obsparmindex].sh_addr,
- sechdrs[obsparmindex].sh_size
- / sizeof(struct obsolete_modparm),
- sechdrs, symindex,
- (char *)sechdrs[strindex].sh_addr);
- if (setupindex)
- printk(KERN_WARNING "%s: Ignoring new-style "
- "parameters in presence of obsolete ones\n",
- mod->name);
- } else {
- /* Size of section 0 is 0, so this works well if no params */
- err = parse_args(mod->name, mod->args,
- (struct kernel_param *)
- sechdrs[setupindex].sh_addr,
- sechdrs[setupindex].sh_size
- / sizeof(struct kernel_param),
- NULL);
- }
+ if (obsparmindex)
+ printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+ mod->name);
+
+ /* Size of section 0 is 0, so this works well if no params */
+ err = parse_args(mod->name, mod->args,
+ (struct kernel_param *)
+ sechdrs[setupindex].sh_addr,
+ sechdrs[setupindex].sh_size
+ / sizeof(struct kernel_param),
+ NULL);
if (err < 0)
goto arch_cleanup;
@@ -1933,13 +1790,13 @@ sys_init_module(void __user *umod,
return -EPERM;
/* Only one module load at a time, please */
- if (down_interruptible(&module_mutex) != 0)
+ if (mutex_lock_interruptible(&module_mutex) != 0)
return -EINTR;
/* Do all the hard work */
mod = load_module(umod, len, uargs);
if (IS_ERR(mod)) {
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
return PTR_ERR(mod);
}
@@ -1948,11 +1805,10 @@ sys_init_module(void __user *umod,
stop_machine_run(__link_module, mod, NR_CPUS);
/* Drop lock so they can recurse */
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
- down(&notify_mutex);
- notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
- up(&notify_mutex);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_COMING, mod);
/* Start the module */
if (mod->init != NULL)
@@ -1967,15 +1823,15 @@ sys_init_module(void __user *umod,
mod->name);
else {
module_put(mod);
- down(&module_mutex);
+ mutex_lock(&module_mutex);
free_module(mod);
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
}
return ret;
}
/* Now it's a first class citizen! */
- down(&module_mutex);
+ mutex_lock(&module_mutex);
mod->state = MODULE_STATE_LIVE;
/* Drop initial reference. */
module_put(mod);
@@ -1983,7 +1839,7 @@ sys_init_module(void __user *umod,
mod->module_init = NULL;
mod->init_size = 0;
mod->init_text_size = 0;
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
return 0;
}
@@ -2073,7 +1929,7 @@ struct module *module_get_kallsym(unsigned int symnum,
{
struct module *mod;
- down(&module_mutex);
+ mutex_lock(&module_mutex);
list_for_each_entry(mod, &modules, list) {
if (symnum < mod->num_symtab) {
*value = mod->symtab[symnum].st_value;
@@ -2081,12 +1937,12 @@ struct module *module_get_kallsym(unsigned int symnum,
strncpy(namebuf,
mod->strtab + mod->symtab[symnum].st_name,
127);
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
return mod;
}
symnum -= mod->num_symtab;
}
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
return NULL;
}
@@ -2129,7 +1985,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
struct list_head *i;
loff_t n = 0;
- down(&module_mutex);
+ mutex_lock(&module_mutex);
list_for_each(i, &modules) {
if (n++ == *pos)
break;
@@ -2150,7 +2006,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
static void m_stop(struct seq_file *m, void *p)
{
- up(&module_mutex);
+ mutex_unlock(&module_mutex);
}
static int m_show(struct seq_file *m, void *p)
diff --git a/kernel/panic.c b/kernel/panic.c
index 126dc43f1c74..f895c7c01d5b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,13 +20,16 @@
#include <linux/nmi.h>
#include <linux/kexec.h>
-int panic_timeout;
int panic_on_oops;
int tainted;
+static int pause_on_oops;
+static int pause_on_oops_flag;
+static DEFINE_SPINLOCK(pause_on_oops_lock);
+int panic_timeout;
EXPORT_SYMBOL(panic_timeout);
-struct notifier_block *panic_notifier_list;
+ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
@@ -94,7 +97,7 @@ NORET_TYPE void panic(const char * fmt, ...)
smp_send_stop();
#endif
- notifier_call_chain(&panic_notifier_list, 0, buf);
+ atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
if (!panic_blink)
panic_blink = no_blink;
@@ -174,3 +177,95 @@ void add_taint(unsigned flag)
tainted |= flag;
}
EXPORT_SYMBOL(add_taint);
+
+static int __init pause_on_oops_setup(char *str)
+{
+ pause_on_oops = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("pause_on_oops=", pause_on_oops_setup);
+
+static void spin_msec(int msecs)
+{
+ int i;
+
+ for (i = 0; i < msecs; i++) {
+ touch_nmi_watchdog();
+ mdelay(1);
+ }
+}
+
+/*
+ * It just happens that oops_enter() and oops_exit() are identically
+ * implemented...
+ */
+static void do_oops_enter_exit(void)
+{
+ unsigned long flags;
+ static int spin_counter;
+
+ if (!pause_on_oops)
+ return;
+
+ spin_lock_irqsave(&pause_on_oops_lock, flags);
+ if (pause_on_oops_flag == 0) {
+ /* This CPU may now print the oops message */
+ pause_on_oops_flag = 1;
+ } else {
+ /* We need to stall this CPU */
+ if (!spin_counter) {
+ /* This CPU gets to do the counting */
+ spin_counter = pause_on_oops;
+ do {
+ spin_unlock(&pause_on_oops_lock);
+ spin_msec(MSEC_PER_SEC);
+ spin_lock(&pause_on_oops_lock);
+ } while (--spin_counter);
+ pause_on_oops_flag = 0;
+ } else {
+ /* This CPU waits for a different one */
+ while (spin_counter) {
+ spin_unlock(&pause_on_oops_lock);
+ spin_msec(1);
+ spin_lock(&pause_on_oops_lock);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&pause_on_oops_lock, flags);
+}
+
+/*
+ * Return true if the calling CPU is allowed to print oops-related info. This
+ * is a bit racy..
+ */
+int oops_may_print(void)
+{
+ return pause_on_oops_flag == 0;
+}
+
+/*
+ * Called when the architecture enters its oops handler, before it prints
+ * anything. If this is the first CPU to oops, and it's oopsing the first time
+ * then let it proceed.
+ *
+ * This is all enabled by the pause_on_oops kernel boot option. We do all this
+ * to ensure that oopses don't scroll off the screen. It has the side-effect
+ * of preventing later-oopsing CPUs from mucking up the display, too.
+ *
+ * It turns out that the CPU which is allowed to print ends up pausing for the
+ * right duration, whereas all the other CPUs pause for twice as long: once in
+ * oops_enter(), once in oops_exit().
+ */
+void oops_enter(void)
+{
+ do_oops_enter_exit();
+}
+
+/*
+ * Called when the architecture exits its oops handler, after printing
+ * everything.
+ */
+void oops_exit(void)
+{
+ do_oops_enter_exit();
+}
diff --git a/kernel/params.c b/kernel/params.c
index c76ad25e6a21..af43ecdc8d9b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,7 +31,7 @@
#define DEBUGP(fmt, a...)
#endif
-static inline int dash2underscore(char c)
+static inline char dash2underscore(char c)
{
if (c == '-')
return '_';
@@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
}
/* We cheat here and temporarily mangle the string. */
-int param_array(const char *name,
- const char *val,
- unsigned int min, unsigned int max,
- void *elem, int elemsize,
- int (*set)(const char *, struct kernel_param *kp),
- int *num)
+static int param_array(const char *name,
+ const char *val,
+ unsigned int min, unsigned int max,
+ void *elem, int elemsize,
+ int (*set)(const char *, struct kernel_param *kp),
+ int *num)
{
int ret;
struct kernel_param kp;
@@ -638,13 +638,8 @@ static ssize_t module_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- if (!try_module_get(mk->mod))
- return -ENODEV;
-
ret = attribute->show(attribute, mk->mod, buf);
- module_put(mk->mod);
-
return ret;
}
@@ -662,13 +657,8 @@ static ssize_t module_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- if (!try_module_get(mk->mod))
- return -ENODEV;
-
ret = attribute->store(attribute, mk->mod, buf, len);
- module_put(mk->mod);
-
return ret;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc07246991..a9f2dfd006d2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -218,36 +218,6 @@ task_t *find_task_by_pid_type(int type, int nr)
EXPORT_SYMBOL(find_task_by_pid_type);
/*
- * This function switches the PIDs if a non-leader thread calls
- * sys_execve() - this must be done without releasing the PID.
- * (which a detach_pid() would eventually do.)
- */
-void switch_exec_pids(task_t *leader, task_t *thread)
-{
- __detach_pid(leader, PIDTYPE_PID);
- __detach_pid(leader, PIDTYPE_TGID);
- __detach_pid(leader, PIDTYPE_PGID);
- __detach_pid(leader, PIDTYPE_SID);
-
- __detach_pid(thread, PIDTYPE_PID);
- __detach_pid(thread, PIDTYPE_TGID);
-
- leader->pid = leader->tgid = thread->pid;
- thread->pid = thread->tgid;
-
- attach_pid(thread, PIDTYPE_PID, thread->pid);
- attach_pid(thread, PIDTYPE_TGID, thread->tgid);
- attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
- attach_pid(thread, PIDTYPE_SID, thread->signal->session);
- list_add_tail(&thread->tasks, &init_task.tasks);
-
- attach_pid(leader, PIDTYPE_PID, leader->pid);
- attach_pid(leader, PIDTYPE_TGID, leader->tgid);
- attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
- attach_pid(leader, PIDTYPE_SID, leader->signal->session);
-}
-
-/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
* more.
@@ -277,16 +247,8 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
- int i;
-
pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
+ /* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, pidmap_array->page);
atomic_dec(&pidmap_array->nr_free);
-
- /*
- * Allocate PID 0, and hash it via all PID types:
- */
-
- for (i = 0; i < PIDTYPE_MAX; i++)
- attach_pid(current, i, 0);
}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fa895fc2ecf5..ac6dc8744429 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,6 +35,7 @@
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/time.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
#include <asm/semaphore.h>
@@ -144,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
struct itimerspec *, struct itimerspec *);
static int common_timer_del(struct k_itimer *timer);
-static int posix_timer_fn(void *data);
+static int posix_timer_fn(struct hrtimer *data);
static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
@@ -250,15 +251,18 @@ __initcall(init_posix_timers);
static void schedule_next_timer(struct k_itimer *timr)
{
+ struct hrtimer *timer = &timr->it.real.timer;
+
if (timr->it.real.interval.tv64 == 0)
return;
- timr->it_overrun += hrtimer_forward(&timr->it.real.timer,
+ timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
timr->it.real.interval);
+
timr->it_overrun_last = timr->it_overrun;
timr->it_overrun = -1;
++timr->it_requeue_pending;
- hrtimer_restart(&timr->it.real.timer);
+ hrtimer_restart(timer);
}
/*
@@ -330,13 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
* This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
*/
-static int posix_timer_fn(void *data)
+static int posix_timer_fn(struct hrtimer *timer)
{
- struct k_itimer *timr = data;
+ struct k_itimer *timr;
unsigned long flags;
int si_private = 0;
int ret = HRTIMER_NORESTART;
+ timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
if (timr->it.real.interval.tv64 != 0)
@@ -350,7 +355,8 @@ static int posix_timer_fn(void *data)
*/
if (timr->it.real.interval.tv64 != 0) {
timr->it_overrun +=
- hrtimer_forward(&timr->it.real.timer,
+ hrtimer_forward(timer,
+ timer->base->softirq_time,
timr->it.real.interval);
ret = HRTIMER_RESTART;
++timr->it_requeue_pending;
@@ -602,38 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
static void
common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
{
- ktime_t remaining;
+ ktime_t now, remaining, iv;
struct hrtimer *timer = &timr->it.real.timer;
memset(cur_setting, 0, sizeof(struct itimerspec));
- remaining = hrtimer_get_remaining(timer);
- /* Time left ? or timer pending */
- if (remaining.tv64 > 0 || hrtimer_active(timer))
- goto calci;
+ iv = timr->it.real.interval;
+
/* interval timer ? */
- if (timr->it.real.interval.tv64 == 0)
+ if (iv.tv64)
+ cur_setting->it_interval = ktime_to_timespec(iv);
+ else if (!hrtimer_active(timer) &&
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
return;
+
+ now = timer->base->get_time();
+
/*
- * When a requeue is pending or this is a SIGEV_NONE timer
- * move the expiry time forward by intervals, so expiry is >
- * now.
+ * When a requeue is pending or this is a SIGEV_NONE
+ * timer move the expiry time forward by intervals, so
+ * expiry is > now.
*/
- if (timr->it_requeue_pending & REQUEUE_PENDING ||
- (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
- timr->it_overrun +=
- hrtimer_forward(timer, timr->it.real.interval);
- remaining = hrtimer_get_remaining(timer);
- }
- calci:
- /* interval timer ? */
- if (timr->it.real.interval.tv64 != 0)
- cur_setting->it_interval =
- ktime_to_timespec(timr->it.real.interval);
+ if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+ timr->it_overrun += hrtimer_forward(timer, now, iv);
+
+ remaining = ktime_sub(timer->expires, now);
/* Return 0 only, when the timer is expired and not pending */
- if (remaining.tv64 <= 0)
- cur_setting->it_value.tv_nsec = 1;
- else
+ if (remaining.tv64 <= 0) {
+ /*
+ * A single shot SIGEV_NONE timer must return 0, when
+ * it is expired !
+ */
+ if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+ cur_setting->it_value.tv_nsec = 1;
+ } else
cur_setting->it_value = ktime_to_timespec(remaining);
}
@@ -716,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
- timr->it.real.timer.data = timr;
timr->it.real.timer.function = posix_timer_fn;
timer->expires = timespec_to_ktime(new_setting->it_value);
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 04be7d0d96a7..8d0af3d37a4b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
obj-y := main.o process.o console.o
obj-$(CONFIG_PM_LEGACY) += pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o
+obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
obj-$(CONFIG_SUSPEND_SMP) += smp.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc980..81d4d982f3f0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
#include "power.h"
-extern suspend_disk_method_t pm_disk_mode;
-
-extern int swsusp_shrink_memory(void);
-extern int swsusp_suspend(void);
-extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
-extern int swsusp_check(void);
-extern int swsusp_read(struct pbe **pblist_ptr);
-extern void swsusp_close(void);
-extern int swsusp_resume(void);
-
-
static int noresume = 0;
char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
@@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode)
while(1);
}
-
-static int in_suspend __nosavedata = 0;
-
-
static inline void platform_finish(void)
{
if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -87,7 +72,6 @@ static int prepare_processes(void)
int error;
pm_prepare_console();
- sys_sync();
disable_nonboot_cpus();
if (freeze_processes()) {
@@ -145,7 +129,7 @@ int pm_suspend_disk(void)
if (in_suspend) {
device_resume();
pr_debug("PM: writing image.\n");
- error = swsusp_write(pagedir_nosave, nr_copy_pages);
+ error = swsusp_write();
if (!error)
power_down(pm_disk_mode);
else {
@@ -216,7 +200,7 @@ static int software_resume(void)
pr_debug("PM: Reading swsusp image.\n");
- if ((error = swsusp_read(&pagedir_nosave))) {
+ if ((error = swsusp_read())) {
swsusp_free();
goto Thaw;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9cb235cba4a9..ee371f50ccaa 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state)
}
-static int suspend_enter(suspend_state_t state)
+int suspend_enter(suspend_state_t state)
{
int error = 0;
unsigned long flags;
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 33c508e857dd..0f6908cce1dd 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -25,6 +25,7 @@
#include <linux/pm.h>
#include <linux/pm_legacy.h>
#include <linux/interrupt.h>
+#include <linux/mutex.h>
int pm_active;
@@ -40,7 +41,7 @@ int pm_active;
* until a resume but that will be fine.
*/
-static DECLARE_MUTEX(pm_devs_lock);
+static DEFINE_MUTEX(pm_devs_lock);
static LIST_HEAD(pm_devs);
/**
@@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type,
dev->id = id;
dev->callback = callback;
- down(&pm_devs_lock);
+ mutex_lock(&pm_devs_lock);
list_add(&dev->entry, &pm_devs);
- up(&pm_devs_lock);
+ mutex_unlock(&pm_devs_lock);
}
return dev;
}
@@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type,
void pm_unregister(struct pm_dev *dev)
{
if (dev) {
- down(&pm_devs_lock);
+ mutex_lock(&pm_devs_lock);
list_del(&dev->entry);
- up(&pm_devs_lock);
+ mutex_unlock(&pm_devs_lock);
kfree(dev);
}
@@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback)
if (!callback)
return;
- down(&pm_devs_lock);
+ mutex_lock(&pm_devs_lock);
entry = pm_devs.next;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback)
if (dev->callback == callback)
__pm_unregister(dev);
}
- up(&pm_devs_lock);
+ mutex_unlock(&pm_devs_lock);
}
/**
@@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data)
{
struct list_head *entry;
- down(&pm_devs_lock);
+ mutex_lock(&pm_devs_lock);
entry = pm_devs.next;
while (entry != &pm_devs) {
struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data)
*/
if (rqst == PM_SUSPEND)
pm_undo_all(dev);
- up(&pm_devs_lock);
+ mutex_unlock(&pm_devs_lock);
return status;
}
}
entry = entry->next;
}
- up(&pm_devs_lock);
+ mutex_unlock(&pm_devs_lock);
return 0;
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba680841..f06f12f21767 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
int cpus;
unsigned long image_pages;
unsigned long pages;
+ unsigned long size;
} __attribute__((aligned(PAGE_SIZE)));
@@ -37,21 +38,79 @@ extern struct subsystem power_subsys;
/* References to section boundaries */
extern const void __nosave_begin, __nosave_end;
-extern unsigned int nr_copy_pages;
extern struct pbe *pagedir_nosave;
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
+extern int in_suspend;
+extern dev_t swsusp_resume_device;
extern asmlinkage int swsusp_arch_suspend(void);
extern asmlinkage int swsusp_arch_resume(void);
extern unsigned int count_data_pages(void);
-extern void free_pagedir(struct pbe *pblist);
-extern void release_eaten_pages(void);
-extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
+
+struct snapshot_handle {
+ loff_t offset;
+ unsigned int page;
+ unsigned int page_offset;
+ unsigned int prev;
+ struct pbe *pbe;
+ void *buffer;
+ unsigned int buf_offset;
+};
+
+#define data_of(handle) ((handle).buffer + (handle).buf_offset)
+
+extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
+extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+int snapshot_image_loaded(struct snapshot_handle *handle);
+
+#define SNAPSHOT_IOC_MAGIC '3'
+#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
+#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
+#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
+#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
+#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
+#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
+#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
+#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
+#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
+#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
+#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
+#define SNAPSHOT_IOC_MAXNR 11
+
+/**
+ * The bitmap is used for tracing allocated swap pages
+ *
+ * The entire bitmap consists of a number of bitmap_page
+ * structures linked with the help of the .next member.
+ * Thus each page can be allocated individually, so we only
+ * need to make 0-order memory allocations to create
+ * the bitmap.
+ */
+
+#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
+#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
+#define BITS_PER_CHUNK (sizeof(long) * 8)
+#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
+
+struct bitmap_page {
+ unsigned long chunks[BITMAP_PAGE_CHUNKS];
+ struct bitmap_page *next;
+};
+
+extern void free_bitmap(struct bitmap_page *bitmap);
+extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
+extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
+extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
+
+extern int swsusp_check(void);
+extern int swsusp_shrink_memory(void);
extern void swsusp_free(void);
-extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
-extern unsigned int snapshot_nr_pages(void);
-extern struct pbe *snapshot_pblist(void);
-extern void snapshot_pblist_set(struct pbe *pblist);
+extern int swsusp_suspend(void);
+extern int swsusp_resume(void);
+extern int swsusp_read(void);
+extern int swsusp_write(void);
+extern void swsusp_close(void);
+extern int suspend_enter(suspend_state_t state);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 28de118f7a0b..8ac7c35fad77 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,11 +12,12 @@
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
+#include <linux/syscalls.h>
/*
* Timeout for stopping processes
*/
-#define TIMEOUT (6 * HZ)
+#define TIMEOUT (20 * HZ)
static inline int freezeable(struct task_struct * p)
@@ -54,38 +55,62 @@ void refrigerator(void)
current->state = save;
}
+static inline void freeze_process(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (!freezing(p)) {
+ freeze(p);
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ signal_wake_up(p, 0);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+}
+
/* 0 = success, else # of processes that we failed to stop */
int freeze_processes(void)
{
- int todo;
+ int todo, nr_user, user_frozen;
unsigned long start_time;
struct task_struct *g, *p;
unsigned long flags;
printk( "Stopping tasks: " );
start_time = jiffies;
+ user_frozen = 0;
do {
- todo = 0;
+ nr_user = todo = 0;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
if (!freezeable(p))
continue;
if (frozen(p))
continue;
-
- freeze(p);
- spin_lock_irqsave(&p->sighand->siglock, flags);
- signal_wake_up(p, 0);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- todo++;
+ if (p->mm && !(p->flags & PF_BORROWED_MM)) {
+ /* The task is a user-space one.
+ * Freeze it unless there's a vfork completion
+ * pending
+ */
+ if (!p->vfork_done)
+ freeze_process(p);
+ nr_user++;
+ } else {
+ /* Freeze only if the user space is frozen */
+ if (user_frozen)
+ freeze_process(p);
+ todo++;
+ }
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
+ todo += nr_user;
+ if (!user_frozen && !nr_user) {
+ sys_sync();
+ start_time = jiffies;
+ }
+ user_frozen = !nr_user;
yield(); /* Yield is okay here */
- if (todo && time_after(jiffies, start_time + TIMEOUT)) {
- printk( "\n" );
- printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
+ if (todo && time_after(jiffies, start_time + TIMEOUT))
break;
- }
} while(todo);
/* This does not unfreeze processes that are already frozen
@@ -94,8 +119,14 @@ int freeze_processes(void)
* but it cleans up leftover PF_FREEZE requests.
*/
if (todo) {
+ printk( "\n" );
+ printk(KERN_ERR " stopping tasks timed out "
+ "after %d seconds (%d tasks remaining):\n",
+ TIMEOUT / HZ, todo);
read_lock(&tasklist_lock);
- do_each_thread(g, p)
+ do_each_thread(g, p) {
+ if (freezeable(p) && !frozen(p))
+ printk(KERN_ERR " %s\n", p->comm);
if (freezing(p)) {
pr_debug(" clean up: %s\n", p->comm);
p->flags &= ~PF_FREEZE;
@@ -103,7 +134,7 @@ int freeze_processes(void)
recalc_sigpending_tsk(p);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
- while_each_thread(g, p);
+ } while_each_thread(g, p);
read_unlock(&tasklist_lock);
return todo;
}
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 911fc62b8225..5957312b2d68 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -49,9 +49,7 @@ void enable_nonboot_cpus(void)
printk("Thawing cpus ...\n");
for_each_cpu_mask(cpu, frozen_cpus) {
- error = smp_prepare_cpu(cpu);
- if (!error)
- error = cpu_up(cpu);
+ error = cpu_up(cpu);
if (!error) {
printk("CPU%d is up\n", cpu);
continue;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8d5a5986d621..c5863d02c89e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
*/
+#include <linux/version.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/suspend.h>
@@ -34,7 +35,9 @@
#include "power.h"
struct pbe *pagedir_nosave;
-unsigned int nr_copy_pages;
+static unsigned int nr_copy_pages;
+static unsigned int nr_meta_pages;
+static unsigned long *buffer;
#ifdef CONFIG_HIGHMEM
unsigned int count_highmem_pages(void)
@@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone)
void *kaddr;
unsigned long pfn = zone_pfn + zone->zone_start_pfn;
- if (!(pfn%1000))
+ if (!(pfn%10000))
printk(".");
if (!pfn_valid(pfn))
continue;
@@ -119,13 +122,15 @@ int save_highmem(void)
struct zone *zone;
int res = 0;
- pr_debug("swsusp: Saving Highmem\n");
+ pr_debug("swsusp: Saving Highmem");
+ drain_local_pages();
for_each_zone (zone) {
if (is_highmem(zone))
res = save_highmem_zone(zone);
if (res)
return res;
}
+ printk("\n");
return 0;
}
@@ -235,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist)
* free_pagedir - free pages allocated with alloc_pagedir()
*/
-void free_pagedir(struct pbe *pblist)
+static void free_pagedir(struct pbe *pblist)
{
struct pbe *pbe;
@@ -301,7 +306,7 @@ struct eaten_page {
static struct eaten_page *eaten_pages = NULL;
-void release_eaten_pages(void)
+static void release_eaten_pages(void)
{
struct eaten_page *p, *q;
@@ -376,7 +381,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
if (!nr_pages)
return NULL;
- pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
pblist = alloc_image_page(gfp_mask, safe_needed);
/* FIXME: rewrite this ugly loop */
for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -388,7 +392,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
free_pagedir(pblist);
pblist = NULL;
} else
- create_pbe_list(pblist, nr_pages);
+ create_pbe_list(pblist, nr_pages);
return pblist;
}
@@ -414,6 +418,10 @@ void swsusp_free(void)
}
}
}
+ nr_copy_pages = 0;
+ nr_meta_pages = 0;
+ pagedir_nosave = NULL;
+ buffer = NULL;
}
@@ -437,7 +445,7 @@ static int enough_free_mem(unsigned int nr_pages)
(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
}
-int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
{
struct pbe *p;
@@ -504,7 +512,318 @@ asmlinkage int swsusp_save(void)
*/
nr_copy_pages = nr_pages;
+ nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
return 0;
}
+
+static void init_header(struct swsusp_info *info)
+{
+ memset(info, 0, sizeof(struct swsusp_info));
+ info->version_code = LINUX_VERSION_CODE;
+ info->num_physpages = num_physpages;
+ memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
+ info->cpus = num_online_cpus();
+ info->image_pages = nr_copy_pages;
+ info->pages = nr_copy_pages + nr_meta_pages + 1;
+ info->size = info->pages;
+ info->size <<= PAGE_SHIFT;
+}
+
+/**
+ * pack_orig_addresses - the .orig_address fields of the PBEs from the
+ * list starting at @pbe are stored in the array @buf[] (1 page)
+ */
+
+static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+{
+ int j;
+
+ for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+ buf[j] = pbe->orig_address;
+ pbe = pbe->next;
+ }
+ if (!pbe)
+ for (; j < PAGE_SIZE / sizeof(long); j++)
+ buf[j] = 0;
+ return pbe;
+}
+
+/**
+ * snapshot_read_next - used for reading the system memory snapshot.
+ *
+ * On the first call to it @handle should point to a zeroed
+ * snapshot_handle structure. The structure gets updated and a pointer
+ * to it should be passed to this function every next time.
+ *
+ * The @count parameter should contain the number of bytes the caller
+ * wants to read from the snapshot. It must not be zero.
+ *
+ * On success the function returns a positive number. Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro. The number returned
+ * may be smaller than @count, but this only happens if the read would
+ * cross a page boundary otherwise.
+ *
+ * The function returns 0 to indicate the end of data stream condition,
+ * and a negative number is returned on error. In such cases the
+ * structure pointed to by @handle is not updated and should not be used
+ * any more.
+ */
+
+int snapshot_read_next(struct snapshot_handle *handle, size_t count)
+{
+ if (handle->page > nr_meta_pages + nr_copy_pages)
+ return 0;
+ if (!buffer) {
+ /* This makes the buffer be freed by swsusp_free() */
+ buffer = alloc_image_page(GFP_ATOMIC, 0);
+ if (!buffer)
+ return -ENOMEM;
+ }
+ if (!handle->offset) {
+ init_header((struct swsusp_info *)buffer);
+ handle->buffer = buffer;
+ handle->pbe = pagedir_nosave;
+ }
+ if (handle->prev < handle->page) {
+ if (handle->page <= nr_meta_pages) {
+ handle->pbe = pack_orig_addresses(buffer, handle->pbe);
+ if (!handle->pbe)
+ handle->pbe = pagedir_nosave;
+ } else {
+ handle->buffer = (void *)handle->pbe->address;
+ handle->pbe = handle->pbe->next;
+ }
+ handle->prev = handle->page;
+ }
+ handle->buf_offset = handle->page_offset;
+ if (handle->page_offset + count >= PAGE_SIZE) {
+ count = PAGE_SIZE - handle->page_offset;
+ handle->page_offset = 0;
+ handle->page++;
+ } else {
+ handle->page_offset += count;
+ }
+ handle->offset += count;
+ return count;
+}
+
+/**
+ * mark_unsafe_pages - mark the pages that cannot be used for storing
+ * the image during resume, because they conflict with the pages that
+ * had been used before suspend
+ */
+
+static int mark_unsafe_pages(struct pbe *pblist)
+{
+ struct zone *zone;
+ unsigned long zone_pfn;
+ struct pbe *p;
+
+ if (!pblist) /* a sanity check */
+ return -EINVAL;
+
+ /* Clear page flags */
+ for_each_zone (zone) {
+ for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+ if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+ ClearPageNosaveFree(pfn_to_page(zone_pfn +
+ zone->zone_start_pfn));
+ }
+
+ /* Mark orig addresses */
+ for_each_pbe (p, pblist) {
+ if (virt_addr_valid(p->orig_address))
+ SetPageNosaveFree(virt_to_page(p->orig_address));
+ else
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+{
+ /* We assume both lists contain the same number of elements */
+ while (src) {
+ dst->orig_address = src->orig_address;
+ dst = dst->next;
+ src = src->next;
+ }
+}
+
+static int check_header(struct swsusp_info *info)
+{
+ char *reason = NULL;
+
+ if (info->version_code != LINUX_VERSION_CODE)
+ reason = "kernel version";
+ if (info->num_physpages != num_physpages)
+ reason = "memory size";
+ if (strcmp(info->uts.sysname,system_utsname.sysname))
+ reason = "system type";
+ if (strcmp(info->uts.release,system_utsname.release))
+ reason = "kernel release";
+ if (strcmp(info->uts.version,system_utsname.version))
+ reason = "version";
+ if (strcmp(info->uts.machine,system_utsname.machine))
+ reason = "machine";
+ if (reason) {
+ printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
+ return -EPERM;
+ }
+ return 0;
+}
+
+/**
+ * load header - check the image header and copy data from it
+ */
+
+static int load_header(struct snapshot_handle *handle,
+ struct swsusp_info *info)
+{
+ int error;
+ struct pbe *pblist;
+
+ error = check_header(info);
+ if (!error) {
+ pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
+ if (!pblist)
+ return -ENOMEM;
+ pagedir_nosave = pblist;
+ handle->pbe = pblist;
+ nr_copy_pages = info->image_pages;
+ nr_meta_pages = info->pages - info->image_pages - 1;
+ }
+ return error;
+}
+
+/**
+ * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
+ * the PBEs in the list starting at @pbe
+ */
+
+static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
+ struct pbe *pbe)
+{
+ int j;
+
+ for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+ pbe->orig_address = buf[j];
+ pbe = pbe->next;
+ }
+ return pbe;
+}
+
+/**
+ * create_image - use metadata contained in the PBE list
+ * pointed to by pagedir_nosave to mark the pages that will
+ * be overwritten in the process of restoring the system
+ * memory state from the image and allocate memory for
+ * the image avoiding these pages
+ */
+
+static int create_image(struct snapshot_handle *handle)
+{
+ int error = 0;
+ struct pbe *p, *pblist;
+
+ p = pagedir_nosave;
+ error = mark_unsafe_pages(p);
+ if (!error) {
+ pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
+ if (pblist)
+ copy_page_backup_list(pblist, p);
+ free_pagedir(p);
+ if (!pblist)
+ error = -ENOMEM;
+ }
+ if (!error)
+ error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+ if (!error) {
+ release_eaten_pages();
+ pagedir_nosave = pblist;
+ } else {
+ pagedir_nosave = NULL;
+ handle->pbe = NULL;
+ nr_copy_pages = 0;
+ nr_meta_pages = 0;
+ }
+ return error;
+}
+
+/**
+ * snapshot_write_next - used for writing the system memory snapshot.
+ *
+ * On the first call to it @handle should point to a zeroed
+ * snapshot_handle structure. The structure gets updated and a pointer
+ * to it should be passed to this function every next time.
+ *
+ * The @count parameter should contain the number of bytes the caller
+ * wants to write to the image. It must not be zero.
+ *
+ * On success the function returns a positive number. Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro. The number returned
+ * may be smaller than @count, but this only happens if the write would
+ * cross a page boundary otherwise.
+ *
+ * The function returns 0 to indicate the "end of file" condition,
+ * and a negative number is returned on error. In such cases the
+ * structure pointed to by @handle is not updated and should not be used
+ * any more.
+ */
+
+int snapshot_write_next(struct snapshot_handle *handle, size_t count)
+{
+ int error = 0;
+
+ if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+ return 0;
+ if (!buffer) {
+ /* This makes the buffer be freed by swsusp_free() */
+ buffer = alloc_image_page(GFP_ATOMIC, 0);
+ if (!buffer)
+ return -ENOMEM;
+ }
+ if (!handle->offset)
+ handle->buffer = buffer;
+ if (handle->prev < handle->page) {
+ if (!handle->prev) {
+ error = load_header(handle, (struct swsusp_info *)buffer);
+ if (error)
+ return error;
+ } else if (handle->prev <= nr_meta_pages) {
+ handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
+ if (!handle->pbe) {
+ error = create_image(handle);
+ if (error)
+ return error;
+ handle->pbe = pagedir_nosave;
+ handle->buffer = (void *)handle->pbe->address;
+ }
+ } else {
+ handle->pbe = handle->pbe->next;
+ handle->buffer = (void *)handle->pbe->address;
+ }
+ handle->prev = handle->page;
+ }
+ handle->buf_offset = handle->page_offset;
+ if (handle->page_offset + count >= PAGE_SIZE) {
+ count = PAGE_SIZE - handle->page_offset;
+ handle->page_offset = 0;
+ handle->page++;
+ } else {
+ handle->page_offset += count;
+ }
+ handle->offset += count;
+ return count;
+}
+
+int snapshot_image_loaded(struct snapshot_handle *handle)
+{
+ return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
+ handle->page <= nr_meta_pages + nr_copy_pages);
+}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 000000000000..044b8e0c1025
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,545 @@
+/*
+ * linux/kernel/power/swap.c
+ *
+ * This file provides functions for reading the suspend image from
+ * and writing it to a swap partition.
+ *
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/smp_lock.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/genhd.h>
+#include <linux/device.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+
+#include "power.h"
+
+extern char resume_file[];
+
+#define SWSUSP_SIG "S1SUSPEND"
+
+static struct swsusp_header {
+ char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
+ swp_entry_t image;
+ char orig_sig[10];
+ char sig[10];
+} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
+
+/*
+ * Saving part...
+ */
+
+static unsigned short root_swap = 0xffff;
+
+static int mark_swapfiles(swp_entry_t start)
+{
+ int error;
+
+ rw_swap_page_sync(READ,
+ swp_entry(root_swap, 0),
+ virt_to_page((unsigned long)&swsusp_header));
+ if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
+ !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
+ memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
+ memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
+ swsusp_header.image = start;
+ error = rw_swap_page_sync(WRITE,
+ swp_entry(root_swap, 0),
+ virt_to_page((unsigned long)
+ &swsusp_header));
+ } else {
+ pr_debug("swsusp: Partition is not swap space.\n");
+ error = -ENODEV;
+ }
+ return error;
+}
+
+/**
+ * swsusp_swap_check - check if the resume device is a swap device
+ * and get its index (if so)
+ */
+
+static int swsusp_swap_check(void) /* This is called before saving image */
+{
+ int res = swap_type_of(swsusp_resume_device);
+
+ if (res >= 0) {
+ root_swap = res;
+ return 0;
+ }
+ return res;
+}
+
+/**
+ * write_page - Write one page to given swap location.
+ * @buf: Address we're writing.
+ * @offset: Offset of the swap page we're writing to.
+ */
+
+static int write_page(void *buf, unsigned long offset)
+{
+ swp_entry_t entry;
+ int error = -ENOSPC;
+
+ if (offset) {
+ entry = swp_entry(root_swap, offset);
+ error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
+ }
+ return error;
+}
+
+/*
+ * The swap map is a data structure used for keeping track of each page
+ * written to a swap partition. It consists of many swap_map_page
+ * structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ * These structures are stored on the swap and linked together with the
+ * help of the .next_swap member.
+ *
+ * The swap map is created during suspend. The swap map pages are
+ * allocated and populated one at a time, so we only need one memory
+ * page to set up the entire structure.
+ *
+ * During resume we also only need to use one swap_map_page structure
+ * at a time.
+ */
+
+#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)
+
+struct swap_map_page {
+ unsigned long entries[MAP_PAGE_ENTRIES];
+ unsigned long next_swap;
+};
+
+/**
+ * The swap_map_handle structure is used for handling swap in
+ * a file-alike way
+ */
+
+struct swap_map_handle {
+ struct swap_map_page *cur;
+ unsigned long cur_swap;
+ struct bitmap_page *bitmap;
+ unsigned int k;
+};
+
+static void release_swap_writer(struct swap_map_handle *handle)
+{
+ if (handle->cur)
+ free_page((unsigned long)handle->cur);
+ handle->cur = NULL;
+ if (handle->bitmap)
+ free_bitmap(handle->bitmap);
+ handle->bitmap = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+ handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+ if (!handle->cur)
+ return -ENOMEM;
+ handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
+ if (!handle->bitmap) {
+ release_swap_writer(handle);
+ return -ENOMEM;
+ }
+ handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+ if (!handle->cur_swap) {
+ release_swap_writer(handle);
+ return -ENOSPC;
+ }
+ handle->k = 0;
+ return 0;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf)
+{
+ int error;
+ unsigned long offset;
+
+ if (!handle->cur)
+ return -EINVAL;
+ offset = alloc_swap_page(root_swap, handle->bitmap);
+ error = write_page(buf, offset);
+ if (error)
+ return error;
+ handle->cur->entries[handle->k++] = offset;
+ if (handle->k >= MAP_PAGE_ENTRIES) {
+ offset = alloc_swap_page(root_swap, handle->bitmap);
+ if (!offset)
+ return -ENOSPC;
+ handle->cur->next_swap = offset;
+ error = write_page(handle->cur, handle->cur_swap);
+ if (error)
+ return error;
+ memset(handle->cur, 0, PAGE_SIZE);
+ handle->cur_swap = offset;
+ handle->k = 0;
+ }
+ return 0;
+}
+
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+ if (handle->cur && handle->cur_swap)
+ return write_page(handle->cur, handle->cur_swap);
+ else
+ return -EINVAL;
+}
+
+/**
+ * save_image - save the suspend image data
+ */
+
+static int save_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_pages)
+{
+ unsigned int m;
+ int ret;
+ int error = 0;
+
+ printk("Saving image data pages (%u pages) ... ", nr_pages);
+ m = nr_pages / 100;
+ if (!m)
+ m = 1;
+ nr_pages = 0;
+ do {
+ ret = snapshot_read_next(snapshot, PAGE_SIZE);
+ if (ret > 0) {
+ error = swap_write_page(handle, data_of(*snapshot));
+ if (error)
+ break;
+ if (!(nr_pages % m))
+ printk("\b\b\b\b%3d%%", nr_pages / m);
+ nr_pages++;
+ }
+ } while (ret > 0);
+ if (!error)
+ printk("\b\b\b\bdone\n");
+ return error;
+}
+
+/**
+ * enough_swap - Make sure we have enough swap to save the image.
+ *
+ * Returns TRUE or FALSE after checking the total amount of swap
+ * space avaiable from the resume partition.
+ */
+
+static int enough_swap(unsigned int nr_pages)
+{
+ unsigned int free_swap = count_swap_pages(root_swap, 1);
+
+ pr_debug("swsusp: free swap pages: %u\n", free_swap);
+ return free_swap > (nr_pages + PAGES_FOR_IO +
+ (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+}
+
+/**
+ * swsusp_write - Write entire image and metadata.
+ *
+ * It is important _NOT_ to umount filesystems at this point. We want
+ * them synced (in case something goes wrong) but we DO not want to mark
+ * filesystem clean: it is not. (And it does not matter, if we resume
+ * correctly, we'll mark system clean, anyway.)
+ */
+
+int swsusp_write(void)
+{
+ struct swap_map_handle handle;
+ struct snapshot_handle snapshot;
+ struct swsusp_info *header;
+ unsigned long start;
+ int error;
+
+ if ((error = swsusp_swap_check())) {
+ printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+ return error;
+ }
+ memset(&snapshot, 0, sizeof(struct snapshot_handle));
+ error = snapshot_read_next(&snapshot, PAGE_SIZE);
+ if (error < PAGE_SIZE)
+ return error < 0 ? error : -EFAULT;
+ header = (struct swsusp_info *)data_of(snapshot);
+ if (!enough_swap(header->pages)) {
+ printk(KERN_ERR "swsusp: Not enough free swap\n");
+ return -ENOSPC;
+ }
+ error = get_swap_writer(&handle);
+ if (!error) {
+ start = handle.cur_swap;
+ error = swap_write_page(&handle, header);
+ }
+ if (!error)
+ error = save_image(&handle, &snapshot, header->pages - 1);
+ if (!error) {
+ flush_swap_writer(&handle);
+ printk("S");
+ error = mark_swapfiles(swp_entry(root_swap, start));
+ printk("|\n");
+ }
+ if (error)
+ free_all_swap_pages(root_swap, handle.bitmap);
+ release_swap_writer(&handle);
+ return error;
+}
+
+/*
+ * Using bio to read from swap.
+ * This code requires a bit more work than just using buffer heads
+ * but, it is the recommended way for 2.5/2.6.
+ * The following are to signal the beginning and end of I/O. Bios
+ * finish asynchronously, while we want them to happen synchronously.
+ * A simple atomic_t, and a wait loop take care of this problem.
+ */
+
+static atomic_t io_done = ATOMIC_INIT(0);
+
+static int end_io(struct bio *bio, unsigned int num, int err)
+{
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ panic("I/O error reading memory image");
+ atomic_set(&io_done, 0);
+ return 0;
+}
+
+static struct block_device *resume_bdev;
+
+/**
+ * submit - submit BIO request.
+ * @rw: READ or WRITE.
+ * @off physical offset of page.
+ * @page: page we're reading or writing.
+ *
+ * Straight from the textbook - allocate and initialize the bio.
+ * If we're writing, make sure the page is marked as dirty.
+ * Then submit it and wait.
+ */
+
+static int submit(int rw, pgoff_t page_off, void *page)
+{
+ int error = 0;
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_ATOMIC, 1);
+ if (!bio)
+ return -ENOMEM;
+ bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+ bio->bi_bdev = resume_bdev;
+ bio->bi_end_io = end_io;
+
+ if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
+ error = -EFAULT;
+ goto Done;
+ }
+
+ atomic_set(&io_done, 1);
+ submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+ while (atomic_read(&io_done))
+ yield();
+ if (rw == READ)
+ bio_set_pages_dirty(bio);
+ Done:
+ bio_put(bio);
+ return error;
+}
+
+static int bio_read_page(pgoff_t page_off, void *page)
+{
+ return submit(READ, page_off, page);
+}
+
+static int bio_write_page(pgoff_t page_off, void *page)
+{
+ return submit(WRITE, page_off, page);
+}
+
+/**
+ * The following functions allow us to read data using a swap map
+ * in a file-alike way
+ */
+
+static void release_swap_reader(struct swap_map_handle *handle)
+{
+ if (handle->cur)
+ free_page((unsigned long)handle->cur);
+ handle->cur = NULL;
+}
+
+static int get_swap_reader(struct swap_map_handle *handle,
+ swp_entry_t start)
+{
+ int error;
+
+ if (!swp_offset(start))
+ return -EINVAL;
+ handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+ if (!handle->cur)
+ return -ENOMEM;
+ error = bio_read_page(swp_offset(start), handle->cur);
+ if (error) {
+ release_swap_reader(handle);
+ return error;
+ }
+ handle->k = 0;
+ return 0;
+}
+
+static int swap_read_page(struct swap_map_handle *handle, void *buf)
+{
+ unsigned long offset;
+ int error;
+
+ if (!handle->cur)
+ return -EINVAL;
+ offset = handle->cur->entries[handle->k];
+ if (!offset)
+ return -EFAULT;
+ error = bio_read_page(offset, buf);
+ if (error)
+ return error;
+ if (++handle->k >= MAP_PAGE_ENTRIES) {
+ handle->k = 0;
+ offset = handle->cur->next_swap;
+ if (!offset)
+ release_swap_reader(handle);
+ else
+ error = bio_read_page(offset, handle->cur);
+ }
+ return error;
+}
+
+/**
+ * load_image - load the image using the swap map handle
+ * @handle and the snapshot handle @snapshot
+ * (assume there are @nr_pages pages to load)
+ */
+
+static int load_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_pages)
+{
+ unsigned int m;
+ int ret;
+ int error = 0;
+
+ printk("Loading image data pages (%u pages) ... ", nr_pages);
+ m = nr_pages / 100;
+ if (!m)
+ m = 1;
+ nr_pages = 0;
+ do {
+ ret = snapshot_write_next(snapshot, PAGE_SIZE);
+ if (ret > 0) {
+ error = swap_read_page(handle, data_of(*snapshot));
+ if (error)
+ break;
+ if (!(nr_pages % m))
+ printk("\b\b\b\b%3d%%", nr_pages / m);
+ nr_pages++;
+ }
+ } while (ret > 0);
+ if (!error) {
+ printk("\b\b\b\bdone\n");
+ if (!snapshot_image_loaded(snapshot))
+ error = -ENODATA;
+ }
+ return error;
+}
+
+int swsusp_read(void)
+{
+ int error;
+ struct swap_map_handle handle;
+ struct snapshot_handle snapshot;
+ struct swsusp_info *header;
+
+ if (IS_ERR(resume_bdev)) {
+ pr_debug("swsusp: block device not initialised\n");
+ return PTR_ERR(resume_bdev);
+ }
+
+ memset(&snapshot, 0, sizeof(struct snapshot_handle));
+ error = snapshot_write_next(&snapshot, PAGE_SIZE);
+ if (error < PAGE_SIZE)
+ return error < 0 ? error : -EFAULT;
+ header = (struct swsusp_info *)data_of(snapshot);
+ error = get_swap_reader(&handle, swsusp_header.image);
+ if (!error)
+ error = swap_read_page(&handle, header);
+ if (!error)
+ error = load_image(&handle, &snapshot, header->pages - 1);
+ release_swap_reader(&handle);
+
+ blkdev_put(resume_bdev);
+
+ if (!error)
+ pr_debug("swsusp: Reading resume file was successful\n");
+ else
+ pr_debug("swsusp: Error %d resuming\n", error);
+ return error;
+}
+
+/**
+ * swsusp_check - Check for swsusp signature in the resume device
+ */
+
+int swsusp_check(void)
+{
+ int error;
+
+ resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+ if (!IS_ERR(resume_bdev)) {
+ set_blocksize(resume_bdev, PAGE_SIZE);
+ memset(&swsusp_header, 0, sizeof(swsusp_header));
+ if ((error = bio_read_page(0, &swsusp_header)))
+ return error;
+ if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
+ memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+ /* Reset swap signature now */
+ error = bio_write_page(0, &swsusp_header);
+ } else {
+ return -EINVAL;
+ }
+ if (error)
+ blkdev_put(resume_bdev);
+ else
+ pr_debug("swsusp: Signature found, resuming\n");
+ } else {
+ error = PTR_ERR(resume_bdev);
+ }
+
+ if (error)
+ pr_debug("swsusp: Error %d check for resume file\n", error);
+
+ return error;
+}
+
+/**
+ * swsusp_close - close swap device.
+ */
+
+void swsusp_close(void)
+{
+ if (IS_ERR(resume_bdev)) {
+ pr_debug("swsusp: block device not initialised\n");
+ return;
+ }
+
+ blkdev_put(resume_bdev);
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 2d9d08f72f76..c4016cbbd3e0 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,41 +31,24 @@
* Fixed runaway init
*
* Rafael J. Wysocki <rjw@sisk.pl>
- * Added the swap map data structure and reworked the handling of swap
+ * Reworked the freeing of memory and the handling of swap
*
* More state savers are welcome. Especially for the scsi layer...
*
* For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
*/
-#include <linux/module.h>
#include <linux/mm.h>
#include <linux/suspend.h>
-#include <linux/smp_lock.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/version.h>
-#include <linux/delay.h>
-#include <linux/bitops.h>
#include <linux/spinlock.h>
-#include <linux/genhd.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/swap.h>
#include <linux/pm.h>
-#include <linux/device.h>
-#include <linux/buffer_head.h>
#include <linux/swapops.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/highmem.h>
-#include <linux/bio.h>
-
-#include <asm/uaccess.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
#include "power.h"
@@ -77,6 +60,8 @@
*/
unsigned long image_size = 500 * 1024 * 1024;
+int in_suspend __nosavedata = 0;
+
#ifdef CONFIG_HIGHMEM
unsigned int count_highmem_pages(void);
int save_highmem(void);
@@ -87,471 +72,97 @@ static int restore_highmem(void) { return 0; }
static unsigned int count_highmem_pages(void) { return 0; }
#endif
-extern char resume_file[];
-
-#define SWSUSP_SIG "S1SUSPEND"
-
-static struct swsusp_header {
- char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
- swp_entry_t image;
- char orig_sig[10];
- char sig[10];
-} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
-
-static struct swsusp_info swsusp_info;
-
-/*
- * Saving part...
- */
-
-static unsigned short root_swap = 0xffff;
-
-static int mark_swapfiles(swp_entry_t start)
-{
- int error;
-
- rw_swap_page_sync(READ,
- swp_entry(root_swap, 0),
- virt_to_page((unsigned long)&swsusp_header));
- if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
- !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
- memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
- memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
- swsusp_header.image = start;
- error = rw_swap_page_sync(WRITE,
- swp_entry(root_swap, 0),
- virt_to_page((unsigned long)
- &swsusp_header));
- } else {
- pr_debug("swsusp: Partition is not swap space.\n");
- error = -ENODEV;
- }
- return error;
-}
-
-/*
- * Check whether the swap device is the specified resume
- * device, irrespective of whether they are specified by
- * identical names.
- *
- * (Thus, device inode aliasing is allowed. You can say /dev/hda4
- * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
- * and they'll be considered the same device. This is *necessary* for
- * devfs, since the resume code can only recognize the form /dev/hda4,
- * but the suspend code would see the long name.)
- */
-static inline int is_resume_device(const struct swap_info_struct *swap_info)
-{
- struct file *file = swap_info->swap_file;
- struct inode *inode = file->f_dentry->d_inode;
-
- return S_ISBLK(inode->i_mode) &&
- swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
-}
-
-static int swsusp_swap_check(void) /* This is called before saving image */
-{
- int i;
-
- spin_lock(&swap_lock);
- for (i = 0; i < MAX_SWAPFILES; i++) {
- if (!(swap_info[i].flags & SWP_WRITEOK))
- continue;
- if (!swsusp_resume_device || is_resume_device(swap_info + i)) {
- spin_unlock(&swap_lock);
- root_swap = i;
- return 0;
- }
- }
- spin_unlock(&swap_lock);
- return -ENODEV;
-}
-
-/**
- * write_page - Write one page to a fresh swap location.
- * @addr: Address we're writing.
- * @loc: Place to store the entry we used.
- *
- * Allocate a new swap entry and 'sync' it. Note we discard -EIO
- * errors. That is an artifact left over from swsusp. It did not
- * check the return of rw_swap_page_sync() at all, since most pages
- * written back to swap would return -EIO.
- * This is a partial improvement, since we will at least return other
- * errors, though we need to eventually fix the damn code.
- */
-static int write_page(unsigned long addr, swp_entry_t *loc)
-{
- swp_entry_t entry;
- int error = -ENOSPC;
-
- entry = get_swap_page_of_type(root_swap);
- if (swp_offset(entry)) {
- error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
- if (!error || error == -EIO)
- *loc = entry;
- }
- return error;
-}
-
/**
- * Swap map-handling functions
- *
- * The swap map is a data structure used for keeping track of each page
- * written to the swap. It consists of many swap_map_page structures
- * that contain each an array of MAP_PAGE_SIZE swap entries.
- * These structures are linked together with the help of either the
- * .next (in memory) or the .next_swap (in swap) member.
+ * The following functions are used for tracing the allocated
+ * swap pages, so that they can be freed in case of an error.
*
- * The swap map is created during suspend. At that time we need to keep
- * it in memory, because we have to free all of the allocated swap
- * entries if an error occurs. The memory needed is preallocated
- * so that we know in advance if there's enough of it.
- *
- * The first swap_map_page structure is filled with the swap entries that
- * correspond to the first MAP_PAGE_SIZE data pages written to swap and
- * so on. After the all of the data pages have been written, the order
- * of the swap_map_page structures in the map is reversed so that they
- * can be read from swap in the original order. This causes the data
- * pages to be loaded in exactly the same order in which they have been
- * saved.
- *
- * During resume we only need to use one swap_map_page structure
- * at a time, which means that we only need to use two memory pages for
- * reading the image - one for reading the swap_map_page structures
- * and the second for reading the data pages from swap.
+ * The functions operate on a linked bitmap structure defined
+ * in power.h
*/
-#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
- / sizeof(swp_entry_t))
-
-struct swap_map_page {
- swp_entry_t entries[MAP_PAGE_SIZE];
- swp_entry_t next_swap;
- struct swap_map_page *next;
-};
-
-static inline void free_swap_map(struct swap_map_page *swap_map)
+void free_bitmap(struct bitmap_page *bitmap)
{
- struct swap_map_page *swp;
+ struct bitmap_page *bp;
- while (swap_map) {
- swp = swap_map->next;
- free_page((unsigned long)swap_map);
- swap_map = swp;
+ while (bitmap) {
+ bp = bitmap->next;
+ free_page((unsigned long)bitmap);
+ bitmap = bp;
}
}
-static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
+struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
{
- struct swap_map_page *swap_map, *swp;
- unsigned n = 0;
+ struct bitmap_page *bitmap, *bp;
+ unsigned int n;
- if (!nr_pages)
+ if (!nr_bits)
return NULL;
- pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
- swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
- swp = swap_map;
- for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
- swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
- swp = swp->next;
- if (!swp) {
- free_swap_map(swap_map);
+ bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+ bp = bitmap;
+ for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
+ bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+ bp = bp->next;
+ if (!bp) {
+ free_bitmap(bitmap);
return NULL;
}
}
- return swap_map;
+ return bitmap;
}
-/**
- * reverse_swap_map - reverse the order of pages in the swap map
- * @swap_map
- */
-
-static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
-{
- struct swap_map_page *prev, *next;
-
- prev = NULL;
- while (swap_map) {
- next = swap_map->next;
- swap_map->next = prev;
- prev = swap_map;
- swap_map = next;
- }
- return prev;
-}
-
-/**
- * free_swap_map_entries - free the swap entries allocated to store
- * the swap map @swap_map (this is only called in case of an error)
- */
-static inline void free_swap_map_entries(struct swap_map_page *swap_map)
-{
- while (swap_map) {
- if (swap_map->next_swap.val)
- swap_free(swap_map->next_swap);
- swap_map = swap_map->next;
- }
-}
-
-/**
- * save_swap_map - save the swap map used for tracing the data pages
- * stored in the swap
- */
-
-static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
-{
- swp_entry_t entry = (swp_entry_t){0};
- int error;
-
- while (swap_map) {
- swap_map->next_swap = entry;
- if ((error = write_page((unsigned long)swap_map, &entry)))
- return error;
- swap_map = swap_map->next;
- }
- *start = entry;
- return 0;
-}
-
-/**
- * free_image_entries - free the swap entries allocated to store
- * the image data pages (this is only called in case of an error)
- */
-
-static inline void free_image_entries(struct swap_map_page *swp)
+static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
{
- unsigned k;
+ unsigned int n;
- while (swp) {
- for (k = 0; k < MAP_PAGE_SIZE; k++)
- if (swp->entries[k].val)
- swap_free(swp->entries[k]);
- swp = swp->next;
+ n = BITMAP_PAGE_BITS;
+ while (bitmap && n <= bit) {
+ n += BITMAP_PAGE_BITS;
+ bitmap = bitmap->next;
}
-}
-
-/**
- * The swap_map_handle structure is used for handling the swap map in
- * a file-alike way
- */
-
-struct swap_map_handle {
- struct swap_map_page *cur;
- unsigned int k;
-};
-
-static inline void init_swap_map_handle(struct swap_map_handle *handle,
- struct swap_map_page *map)
-{
- handle->cur = map;
- handle->k = 0;
-}
-
-static inline int swap_map_write_page(struct swap_map_handle *handle,
- unsigned long addr)
-{
- int error;
-
- error = write_page(addr, handle->cur->entries + handle->k);
- if (error)
- return error;
- if (++handle->k >= MAP_PAGE_SIZE) {
- handle->cur = handle->cur->next;
- handle->k = 0;
+ if (!bitmap)
+ return -EINVAL;
+ n -= BITMAP_PAGE_BITS;
+ bit -= n;
+ n = 0;
+ while (bit >= BITS_PER_CHUNK) {
+ bit -= BITS_PER_CHUNK;
+ n++;
}
+ bitmap->chunks[n] |= (1UL << bit);
return 0;
}
-/**
- * save_image_data - save the data pages pointed to by the PBEs
- * from the list @pblist using the swap map handle @handle
- * (assume there are @nr_pages data pages to save)
- */
-
-static int save_image_data(struct pbe *pblist,
- struct swap_map_handle *handle,
- unsigned int nr_pages)
-{
- unsigned int m;
- struct pbe *p;
- int error = 0;
-
- printk("Saving image data pages (%u pages) ... ", nr_pages);
- m = nr_pages / 100;
- if (!m)
- m = 1;
- nr_pages = 0;
- for_each_pbe (p, pblist) {
- error = swap_map_write_page(handle, p->address);
- if (error)
- break;
- if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
- nr_pages++;
- }
- if (!error)
- printk("\b\b\b\bdone\n");
- return error;
-}
-
-static void dump_info(void)
-{
- pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
- pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
- pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
- pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
- pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
- pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
- pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
- pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
- pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
- pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
- pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
-}
-
-static void init_header(unsigned int nr_pages)
-{
- memset(&swsusp_info, 0, sizeof(swsusp_info));
- swsusp_info.version_code = LINUX_VERSION_CODE;
- swsusp_info.num_physpages = num_physpages;
- memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
-
- swsusp_info.cpus = num_online_cpus();
- swsusp_info.image_pages = nr_pages;
- swsusp_info.pages = nr_pages +
- ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
-}
-
-/**
- * pack_orig_addresses - the .orig_address fields of the PBEs from the
- * list starting at @pbe are stored in the array @buf[] (1 page)
- */
-
-static inline struct pbe *pack_orig_addresses(unsigned long *buf,
- struct pbe *pbe)
-{
- int j;
-
- for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
- buf[j] = pbe->orig_address;
- pbe = pbe->next;
- }
- if (!pbe)
- for (; j < PAGE_SIZE / sizeof(long); j++)
- buf[j] = 0;
- return pbe;
-}
-
-/**
- * save_image_metadata - save the .orig_address fields of the PBEs
- * from the list @pblist using the swap map handle @handle
- */
-
-static int save_image_metadata(struct pbe *pblist,
- struct swap_map_handle *handle)
+unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
{
- unsigned long *buf;
- unsigned int n = 0;
- struct pbe *p;
- int error = 0;
+ unsigned long offset;
- printk("Saving image metadata ... ");
- buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
- if (!buf)
- return -ENOMEM;
- p = pblist;
- while (p) {
- p = pack_orig_addresses(buf, p);
- error = swap_map_write_page(handle, (unsigned long)buf);
- if (error)
- break;
- n++;
+ offset = swp_offset(get_swap_page_of_type(swap));
+ if (offset) {
+ if (bitmap_set(bitmap, offset)) {
+ swap_free(swp_entry(swap, offset));
+ offset = 0;
+ }
}
- free_page((unsigned long)buf);
- if (!error)
- printk("done (%u pages saved)\n", n);
- return error;
+ return offset;
}
-/**
- * enough_swap - Make sure we have enough swap to save the image.
- *
- * Returns TRUE or FALSE after checking the total amount of swap
- * space avaiable from the resume partition.
- */
-
-static int enough_swap(unsigned int nr_pages)
+void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
{
- unsigned int free_swap = swap_info[root_swap].pages -
- swap_info[root_swap].inuse_pages;
-
- pr_debug("swsusp: free swap pages: %u\n", free_swap);
- return free_swap > (nr_pages + PAGES_FOR_IO +
- (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
-}
+ unsigned int bit, n;
+ unsigned long test;
-/**
- * swsusp_write - Write entire image and metadata.
- *
- * It is important _NOT_ to umount filesystems at this point. We want
- * them synced (in case something goes wrong) but we DO not want to mark
- * filesystem clean: it is not. (And it does not matter, if we resume
- * correctly, we'll mark system clean, anyway.)
- */
-
-int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
-{
- struct swap_map_page *swap_map;
- struct swap_map_handle handle;
- swp_entry_t start;
- int error;
-
- if ((error = swsusp_swap_check())) {
- printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
- return error;
- }
- if (!enough_swap(nr_pages)) {
- printk(KERN_ERR "swsusp: Not enough free swap\n");
- return -ENOSPC;
+ bit = 0;
+ while (bitmap) {
+ for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
+ for (test = 1UL; test; test <<= 1) {
+ if (bitmap->chunks[n] & test)
+ swap_free(swp_entry(swap, bit));
+ bit++;
+ }
+ bitmap = bitmap->next;
}
-
- init_header(nr_pages);
- swap_map = alloc_swap_map(swsusp_info.pages);
- if (!swap_map)
- return -ENOMEM;
- init_swap_map_handle(&handle, swap_map);
-
- error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
- if (!error)
- error = save_image_metadata(pblist, &handle);
- if (!error)
- error = save_image_data(pblist, &handle, nr_pages);
- if (error)
- goto Free_image_entries;
-
- swap_map = reverse_swap_map(swap_map);
- error = save_swap_map(swap_map, &start);
- if (error)
- goto Free_map_entries;
-
- dump_info();
- printk( "S" );
- error = mark_swapfiles(start);
- printk( "|\n" );
- if (error)
- goto Free_map_entries;
-
-Free_swap_map:
- free_swap_map(swap_map);
- return error;
-
-Free_map_entries:
- free_swap_map_entries(swap_map);
-Free_image_entries:
- free_image_entries(swap_map);
- goto Free_swap_map;
}
/**
@@ -660,379 +271,3 @@ int swsusp_resume(void)
local_irq_enable();
return error;
}
-
-/**
- * mark_unsafe_pages - mark the pages that cannot be used for storing
- * the image during resume, because they conflict with the pages that
- * had been used before suspend
- */
-
-static void mark_unsafe_pages(struct pbe *pblist)
-{
- struct zone *zone;
- unsigned long zone_pfn;
- struct pbe *p;
-
- if (!pblist) /* a sanity check */
- return;
-
- /* Clear page flags */
- for_each_zone (zone) {
- for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
- if (pfn_valid(zone_pfn + zone->zone_start_pfn))
- ClearPageNosaveFree(pfn_to_page(zone_pfn +
- zone->zone_start_pfn));
- }
-
- /* Mark orig addresses */
- for_each_pbe (p, pblist)
- SetPageNosaveFree(virt_to_page(p->orig_address));
-
-}
-
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
-{
- /* We assume both lists contain the same number of elements */
- while (src) {
- dst->orig_address = src->orig_address;
- dst = dst->next;
- src = src->next;
- }
-}
-
-/*
- * Using bio to read from swap.
- * This code requires a bit more work than just using buffer heads
- * but, it is the recommended way for 2.5/2.6.
- * The following are to signal the beginning and end of I/O. Bios
- * finish asynchronously, while we want them to happen synchronously.
- * A simple atomic_t, and a wait loop take care of this problem.
- */
-
-static atomic_t io_done = ATOMIC_INIT(0);
-
-static int end_io(struct bio *bio, unsigned int num, int err)
-{
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- panic("I/O error reading memory image");
- atomic_set(&io_done, 0);
- return 0;
-}
-
-static struct block_device *resume_bdev;
-
-/**
- * submit - submit BIO request.
- * @rw: READ or WRITE.
- * @off physical offset of page.
- * @page: page we're reading or writing.
- *
- * Straight from the textbook - allocate and initialize the bio.
- * If we're writing, make sure the page is marked as dirty.
- * Then submit it and wait.
- */
-
-static int submit(int rw, pgoff_t page_off, void *page)
-{
- int error = 0;
- struct bio *bio;
-
- bio = bio_alloc(GFP_ATOMIC, 1);
- if (!bio)
- return -ENOMEM;
- bio->bi_sector = page_off * (PAGE_SIZE >> 9);
- bio->bi_bdev = resume_bdev;
- bio->bi_end_io = end_io;
-
- if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
- printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
- error = -EFAULT;
- goto Done;
- }
-
-
- atomic_set(&io_done, 1);
- submit_bio(rw | (1 << BIO_RW_SYNC), bio);
- while (atomic_read(&io_done))
- yield();
- if (rw == READ)
- bio_set_pages_dirty(bio);
- Done:
- bio_put(bio);
- return error;
-}
-
-static int bio_read_page(pgoff_t page_off, void *page)
-{
- return submit(READ, page_off, page);
-}
-
-static int bio_write_page(pgoff_t page_off, void *page)
-{
- return submit(WRITE, page_off, page);
-}
-
-/**
- * The following functions allow us to read data using a swap map
- * in a file-alike way
- */
-
-static inline void release_swap_map_reader(struct swap_map_handle *handle)
-{
- if (handle->cur)
- free_page((unsigned long)handle->cur);
- handle->cur = NULL;
-}
-
-static inline int get_swap_map_reader(struct swap_map_handle *handle,
- swp_entry_t start)
-{
- int error;
-
- if (!swp_offset(start))
- return -EINVAL;
- handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
- if (!handle->cur)
- return -ENOMEM;
- error = bio_read_page(swp_offset(start), handle->cur);
- if (error) {
- release_swap_map_reader(handle);
- return error;
- }
- handle->k = 0;
- return 0;
-}
-
-static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
-{
- unsigned long offset;
- int error;
-
- if (!handle->cur)
- return -EINVAL;
- offset = swp_offset(handle->cur->entries[handle->k]);
- if (!offset)
- return -EINVAL;
- error = bio_read_page(offset, buf);
- if (error)
- return error;
- if (++handle->k >= MAP_PAGE_SIZE) {
- handle->k = 0;
- offset = swp_offset(handle->cur->next_swap);
- if (!offset)
- release_swap_map_reader(handle);
- else
- error = bio_read_page(offset, handle->cur);
- }
- return error;
-}
-
-static int check_header(void)
-{
- char *reason = NULL;
-
- dump_info();
- if (swsusp_info.version_code != LINUX_VERSION_CODE)
- reason = "kernel version";
- if (swsusp_info.num_physpages != num_physpages)
- reason = "memory size";
- if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
- reason = "system type";
- if (strcmp(swsusp_info.uts.release,system_utsname.release))
- reason = "kernel release";
- if (strcmp(swsusp_info.uts.version,system_utsname.version))
- reason = "version";
- if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
- reason = "machine";
- if (reason) {
- printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
- return -EPERM;
- }
- return 0;
-}
-
-/**
- * load_image_data - load the image data using the swap map handle
- * @handle and store them using the page backup list @pblist
- * (assume there are @nr_pages pages to load)
- */
-
-static int load_image_data(struct pbe *pblist,
- struct swap_map_handle *handle,
- unsigned int nr_pages)
-{
- int error;
- unsigned int m;
- struct pbe *p;
-
- if (!pblist)
- return -EINVAL;
- printk("Loading image data pages (%u pages) ... ", nr_pages);
- m = nr_pages / 100;
- if (!m)
- m = 1;
- nr_pages = 0;
- p = pblist;
- while (p) {
- error = swap_map_read_page(handle, (void *)p->address);
- if (error)
- break;
- p = p->next;
- if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
- nr_pages++;
- }
- if (!error)
- printk("\b\b\b\bdone\n");
- return error;
-}
-
-/**
- * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- * the PBEs in the list starting at @pbe
- */
-
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
- struct pbe *pbe)
-{
- int j;
-
- for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
- pbe->orig_address = buf[j];
- pbe = pbe->next;
- }
- return pbe;
-}
-
-/**
- * load_image_metadata - load the image metadata using the swap map
- * handle @handle and put them into the PBEs in the list @pblist
- */
-
-static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
-{
- struct pbe *p;
- unsigned long *buf;
- unsigned int n = 0;
- int error = 0;
-
- printk("Loading image metadata ... ");
- buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
- if (!buf)
- return -ENOMEM;
- p = pblist;
- while (p) {
- error = swap_map_read_page(handle, buf);
- if (error)
- break;
- p = unpack_orig_addresses(buf, p);
- n++;
- }
- free_page((unsigned long)buf);
- if (!error)
- printk("done (%u pages loaded)\n", n);
- return error;
-}
-
-int swsusp_read(struct pbe **pblist_ptr)
-{
- int error;
- struct pbe *p, *pblist;
- struct swap_map_handle handle;
- unsigned int nr_pages;
-
- if (IS_ERR(resume_bdev)) {
- pr_debug("swsusp: block device not initialised\n");
- return PTR_ERR(resume_bdev);
- }
-
- error = get_swap_map_reader(&handle, swsusp_header.image);
- if (!error)
- error = swap_map_read_page(&handle, &swsusp_info);
- if (!error)
- error = check_header();
- if (error)
- return error;
- nr_pages = swsusp_info.image_pages;
- p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
- if (!p)
- return -ENOMEM;
- error = load_image_metadata(p, &handle);
- if (!error) {
- mark_unsafe_pages(p);
- pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
- if (pblist)
- copy_page_backup_list(pblist, p);
- free_pagedir(p);
- if (!pblist)
- error = -ENOMEM;
-
- /* Allocate memory for the image and read the data from swap */
- if (!error)
- error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
- if (!error) {
- release_eaten_pages();
- error = load_image_data(pblist, &handle, nr_pages);
- }
- if (!error)
- *pblist_ptr = pblist;
- }
- release_swap_map_reader(&handle);
-
- blkdev_put(resume_bdev);
-
- if (!error)
- pr_debug("swsusp: Reading resume file was successful\n");
- else
- pr_debug("swsusp: Error %d resuming\n", error);
- return error;
-}
-
-/**
- * swsusp_check - Check for swsusp signature in the resume device
- */
-
-int swsusp_check(void)
-{
- int error;
-
- resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
- if (!IS_ERR(resume_bdev)) {
- set_blocksize(resume_bdev, PAGE_SIZE);
- memset(&swsusp_header, 0, sizeof(swsusp_header));
- if ((error = bio_read_page(0, &swsusp_header)))
- return error;
- if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
- memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
- /* Reset swap signature now */
- error = bio_write_page(0, &swsusp_header);
- } else {
- return -EINVAL;
- }
- if (error)
- blkdev_put(resume_bdev);
- else
- pr_debug("swsusp: Signature found, resuming\n");
- } else {
- error = PTR_ERR(resume_bdev);
- }
-
- if (error)
- pr_debug("swsusp: Error %d check for resume file\n", error);
-
- return error;
-}
-
-/**
- * swsusp_close - close swap device.
- */
-
-void swsusp_close(void)
-{
- if (IS_ERR(resume_bdev)) {
- pr_debug("swsusp: block device not initialised\n");
- return;
- }
-
- blkdev_put(resume_bdev);
-}
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 000000000000..3f1539fbe48a
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,333 @@
+/*
+ * linux/kernel/power/user.c
+ *
+ * This file provides the user space interface for software suspend/resume.
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+
+#include "power.h"
+
+#define SNAPSHOT_MINOR 231
+
+static struct snapshot_data {
+ struct snapshot_handle handle;
+ int swap;
+ struct bitmap_page *bitmap;
+ int mode;
+ char frozen;
+ char ready;
+} snapshot_state;
+
+static atomic_t device_available = ATOMIC_INIT(1);
+
+static int snapshot_open(struct inode *inode, struct file *filp)
+{
+ struct snapshot_data *data;
+
+ if (!atomic_add_unless(&device_available, -1, 0))
+ return -EBUSY;
+
+ if ((filp->f_flags & O_ACCMODE) == O_RDWR)
+ return -ENOSYS;
+
+ nonseekable_open(inode, filp);
+ data = &snapshot_state;
+ filp->private_data = data;
+ memset(&data->handle, 0, sizeof(struct snapshot_handle));
+ if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+ data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
+ data->mode = O_RDONLY;
+ } else {
+ data->swap = -1;
+ data->mode = O_WRONLY;
+ }
+ data->bitmap = NULL;
+ data->frozen = 0;
+ data->ready = 0;
+
+ return 0;
+}
+
+static int snapshot_release(struct inode *inode, struct file *filp)
+{
+ struct snapshot_data *data;
+
+ swsusp_free();
+ data = filp->private_data;
+ free_all_swap_pages(data->swap, data->bitmap);
+ free_bitmap(data->bitmap);
+ if (data->frozen) {
+ down(&pm_sem);
+ thaw_processes();
+ enable_nonboot_cpus();
+ up(&pm_sem);
+ }
+ atomic_inc(&device_available);
+ return 0;
+}
+
+static ssize_t snapshot_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *offp)
+{
+ struct snapshot_data *data;
+ ssize_t res;
+
+ data = filp->private_data;
+ res = snapshot_read_next(&data->handle, count);
+ if (res > 0) {
+ if (copy_to_user(buf, data_of(data->handle), res))
+ res = -EFAULT;
+ else
+ *offp = data->handle.offset;
+ }
+ return res;
+}
+
+static ssize_t snapshot_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *offp)
+{
+ struct snapshot_data *data;
+ ssize_t res;
+
+ data = filp->private_data;
+ res = snapshot_write_next(&data->handle, count);
+ if (res > 0) {
+ if (copy_from_user(data_of(data->handle), buf, res))
+ res = -EFAULT;
+ else
+ *offp = data->handle.offset;
+ }
+ return res;
+}
+
+static int snapshot_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ int error = 0;
+ struct snapshot_data *data;
+ loff_t offset, avail;
+
+ if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
+ return -ENOTTY;
+ if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
+ return -ENOTTY;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ data = filp->private_data;
+
+ switch (cmd) {
+
+ case SNAPSHOT_FREEZE:
+ if (data->frozen)
+ break;
+ down(&pm_sem);
+ disable_nonboot_cpus();
+ if (freeze_processes()) {
+ thaw_processes();
+ enable_nonboot_cpus();
+ error = -EBUSY;
+ }
+ up(&pm_sem);
+ if (!error)
+ data->frozen = 1;
+ break;
+
+ case SNAPSHOT_UNFREEZE:
+ if (!data->frozen)
+ break;
+ down(&pm_sem);
+ thaw_processes();
+ enable_nonboot_cpus();
+ up(&pm_sem);
+ data->frozen = 0;
+ break;
+
+ case SNAPSHOT_ATOMIC_SNAPSHOT:
+ if (data->mode != O_RDONLY || !data->frozen || data->ready) {
+ error = -EPERM;
+ break;
+ }
+ down(&pm_sem);
+ /* Free memory before shutting down devices. */
+ error = swsusp_shrink_memory();
+ if (!error) {
+ error = device_suspend(PMSG_FREEZE);
+ if (!error) {
+ in_suspend = 1;
+ error = swsusp_suspend();
+ device_resume();
+ }
+ }
+ up(&pm_sem);
+ if (!error)
+ error = put_user(in_suspend, (unsigned int __user *)arg);
+ if (!error)
+ data->ready = 1;
+ break;
+
+ case SNAPSHOT_ATOMIC_RESTORE:
+ if (data->mode != O_WRONLY || !data->frozen ||
+ !snapshot_image_loaded(&data->handle)) {
+ error = -EPERM;
+ break;
+ }
+ down(&pm_sem);
+ pm_prepare_console();
+ error = device_suspend(PMSG_FREEZE);
+ if (!error) {
+ error = swsusp_resume();
+ device_resume();
+ }
+ pm_restore_console();
+ up(&pm_sem);
+ break;
+
+ case SNAPSHOT_FREE:
+ swsusp_free();
+ memset(&data->handle, 0, sizeof(struct snapshot_handle));
+ data->ready = 0;
+ break;
+
+ case SNAPSHOT_SET_IMAGE_SIZE:
+ image_size = arg;
+ break;
+
+ case SNAPSHOT_AVAIL_SWAP:
+ avail = count_swap_pages(data->swap, 1);
+ avail <<= PAGE_SHIFT;
+ error = put_user(avail, (loff_t __user *)arg);
+ break;
+
+ case SNAPSHOT_GET_SWAP_PAGE:
+ if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+ error = -ENODEV;
+ break;
+ }
+ if (!data->bitmap) {
+ data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
+ if (!data->bitmap) {
+ error = -ENOMEM;
+ break;
+ }
+ }
+ offset = alloc_swap_page(data->swap, data->bitmap);
+ if (offset) {
+ offset <<= PAGE_SHIFT;
+ error = put_user(offset, (loff_t __user *)arg);
+ } else {
+ error = -ENOSPC;
+ }
+ break;
+
+ case SNAPSHOT_FREE_SWAP_PAGES:
+ if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+ error = -ENODEV;
+ break;
+ }
+ free_all_swap_pages(data->swap, data->bitmap);
+ free_bitmap(data->bitmap);
+ data->bitmap = NULL;
+ break;
+
+ case SNAPSHOT_SET_SWAP_FILE:
+ if (!data->bitmap) {
+ /*
+ * User space encodes device types as two-byte values,
+ * so we need to recode them
+ */
+ if (old_decode_dev(arg)) {
+ data->swap = swap_type_of(old_decode_dev(arg));
+ if (data->swap < 0)
+ error = -ENODEV;
+ } else {
+ data->swap = -1;
+ error = -EINVAL;
+ }
+ } else {
+ error = -EPERM;
+ }
+ break;
+
+ case SNAPSHOT_S2RAM:
+ if (!data->frozen) {
+ error = -EPERM;
+ break;
+ }
+
+ if (down_trylock(&pm_sem)) {
+ error = -EBUSY;
+ break;
+ }
+
+ if (pm_ops->prepare) {
+ error = pm_ops->prepare(PM_SUSPEND_MEM);
+ if (error)
+ goto OutS3;
+ }
+
+ /* Put devices to sleep */
+ error = device_suspend(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "Failed to suspend some devices.\n");
+ } else {
+ /* Enter S3, system is already frozen */
+ suspend_enter(PM_SUSPEND_MEM);
+
+ /* Wake up devices */
+ device_resume();
+ }
+
+ if (pm_ops->finish)
+ pm_ops->finish(PM_SUSPEND_MEM);
+
+OutS3:
+ up(&pm_sem);
+ break;
+
+ default:
+ error = -ENOTTY;
+
+ }
+
+ return error;
+}
+
+static struct file_operations snapshot_fops = {
+ .open = snapshot_open,
+ .release = snapshot_release,
+ .read = snapshot_read,
+ .write = snapshot_write,
+ .llseek = no_llseek,
+ .ioctl = snapshot_ioctl,
+};
+
+static struct miscdevice snapshot_device = {
+ .minor = SNAPSHOT_MINOR,
+ .name = "snapshot",
+ .fops = &snapshot_fops,
+};
+
+static int __init snapshot_device_init(void)
+{
+ return misc_register(&snapshot_device);
+};
+
+device_initcall(snapshot_device_init);
diff --git a/kernel/printk.c b/kernel/printk.c
index 7ba79ad895e4..891f7a714037 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -122,44 +122,6 @@ static char *log_buf = __log_buf;
static int log_buf_len = __LOG_BUF_LEN;
static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
-/*
- * Setup a list of consoles. Called from init/main.c
- */
-static int __init console_setup(char *str)
-{
- char name[sizeof(console_cmdline[0].name)];
- char *s, *options;
- int idx;
-
- /*
- * Decode str into name, index, options.
- */
- if (str[0] >= '0' && str[0] <= '9') {
- strcpy(name, "ttyS");
- strncpy(name + 4, str, sizeof(name) - 5);
- } else
- strncpy(name, str, sizeof(name) - 1);
- name[sizeof(name) - 1] = 0;
- if ((options = strchr(str, ',')) != NULL)
- *(options++) = 0;
-#ifdef __sparc__
- if (!strcmp(str, "ttya"))
- strcpy(name, "ttyS0");
- if (!strcmp(str, "ttyb"))
- strcpy(name, "ttyS1");
-#endif
- for (s = name; *s; s++)
- if ((*s >= '0' && *s <= '9') || *s == ',')
- break;
- idx = simple_strtoul(s, NULL, 10);
- *s = 0;
-
- add_preferred_console(name, idx, options);
- return 1;
-}
-
-__setup("console=", console_setup);
-
static int __init log_buf_len_setup(char *str)
{
unsigned long size = memparse(str, &str);
@@ -659,6 +621,44 @@ static void call_console_drivers(unsigned long start, unsigned long end)
#endif
+/*
+ * Set up a list of consoles. Called from init/main.c
+ */
+static int __init console_setup(char *str)
+{
+ char name[sizeof(console_cmdline[0].name)];
+ char *s, *options;
+ int idx;
+
+ /*
+ * Decode str into name, index, options.
+ */
+ if (str[0] >= '0' && str[0] <= '9') {
+ strcpy(name, "ttyS");
+ strncpy(name + 4, str, sizeof(name) - 5);
+ } else {
+ strncpy(name, str, sizeof(name) - 1);
+ }
+ name[sizeof(name) - 1] = 0;
+ if ((options = strchr(str, ',')) != NULL)
+ *(options++) = 0;
+#ifdef __sparc__
+ if (!strcmp(str, "ttya"))
+ strcpy(name, "ttyS0");
+ if (!strcmp(str, "ttyb"))
+ strcpy(name, "ttyS1");
+#endif
+ for (s = name; *s; s++)
+ if ((*s >= '0' && *s <= '9') || *s == ',')
+ break;
+ idx = simple_strtoul(s, NULL, 10);
+ *s = 0;
+
+ add_preferred_console(name, idx, options);
+ return 1;
+}
+__setup("console=", console_setup);
+
/**
* add_preferred_console - add a device to the list of preferred consoles.
* @name: device name
diff --git a/kernel/profile.c b/kernel/profile.c
index f89248e6d704..5a730fdb1a2c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,6 +23,7 @@
#include <linux/cpu.h>
#include <linux/profile.h>
#include <linux/highmem.h>
+#include <linux/mutex.h>
#include <asm/sections.h>
#include <asm/semaphore.h>
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
-static DECLARE_MUTEX(profile_flip_mutex);
+static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */
static int __init profile_setup(char * str)
@@ -86,72 +87,52 @@ void __init profile_init(void)
#ifdef CONFIG_PROFILING
-static DECLARE_RWSEM(profile_rwsem);
-static DEFINE_RWLOCK(handoff_lock);
-static struct notifier_block * task_exit_notifier;
-static struct notifier_block * task_free_notifier;
-static struct notifier_block * munmap_notifier;
+static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
void profile_task_exit(struct task_struct * task)
{
- down_read(&profile_rwsem);
- notifier_call_chain(&task_exit_notifier, 0, task);
- up_read(&profile_rwsem);
+ blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}
int profile_handoff_task(struct task_struct * task)
{
int ret;
- read_lock(&handoff_lock);
- ret = notifier_call_chain(&task_free_notifier, 0, task);
- read_unlock(&handoff_lock);
+ ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
return (ret == NOTIFY_OK) ? 1 : 0;
}
void profile_munmap(unsigned long addr)
{
- down_read(&profile_rwsem);
- notifier_call_chain(&munmap_notifier, 0, (void *)addr);
- up_read(&profile_rwsem);
+ blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
}
int task_handoff_register(struct notifier_block * n)
{
- int err = -EINVAL;
-
- write_lock(&handoff_lock);
- err = notifier_chain_register(&task_free_notifier, n);
- write_unlock(&handoff_lock);
- return err;
+ return atomic_notifier_chain_register(&task_free_notifier, n);
}
int task_handoff_unregister(struct notifier_block * n)
{
- int err = -EINVAL;
-
- write_lock(&handoff_lock);
- err = notifier_chain_unregister(&task_free_notifier, n);
- write_unlock(&handoff_lock);
- return err;
+ return atomic_notifier_chain_unregister(&task_free_notifier, n);
}
int profile_event_register(enum profile_type type, struct notifier_block * n)
{
int err = -EINVAL;
- down_write(&profile_rwsem);
-
switch (type) {
case PROFILE_TASK_EXIT:
- err = notifier_chain_register(&task_exit_notifier, n);
+ err = blocking_notifier_chain_register(
+ &task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
- err = notifier_chain_register(&munmap_notifier, n);
+ err = blocking_notifier_chain_register(
+ &munmap_notifier, n);
break;
}
- up_write(&profile_rwsem);
-
return err;
}
@@ -160,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
{
int err = -EINVAL;
- down_write(&profile_rwsem);
-
switch (type) {
case PROFILE_TASK_EXIT:
- err = notifier_chain_unregister(&task_exit_notifier, n);
+ err = blocking_notifier_chain_unregister(
+ &task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
- err = notifier_chain_unregister(&munmap_notifier, n);
+ err = blocking_notifier_chain_unregister(
+ &munmap_notifier, n);
break;
}
- up_write(&profile_rwsem);
return err;
}
@@ -243,7 +223,7 @@ static void profile_flip_buffers(void)
{
int i, j, cpu;
- down(&profile_flip_mutex);
+ mutex_lock(&profile_flip_mutex);
j = per_cpu(cpu_profile_flip, get_cpu());
put_cpu();
on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -259,14 +239,14 @@ static void profile_flip_buffers(void)
hits[i].hits = hits[i].pc = 0;
}
}
- up(&profile_flip_mutex);
+ mutex_unlock(&profile_flip_mutex);
}
static void profile_discard_flip_buffers(void)
{
int i, cpu;
- down(&profile_flip_mutex);
+ mutex_lock(&profile_flip_mutex);
i = per_cpu(cpu_profile_flip, get_cpu());
put_cpu();
on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -274,7 +254,7 @@ static void profile_discard_flip_buffers(void)
struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
}
- up(&profile_flip_mutex);
+ mutex_unlock(&profile_flip_mutex);
}
void profile_hit(int type, void *__pc)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d95a72c9279d..86a7f6c60cb2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -35,9 +35,9 @@ void __ptrace_link(task_t *child, task_t *new_parent)
if (child->parent == new_parent)
return;
list_add(&child->ptrace_list, &child->parent->ptrace_children);
- REMOVE_LINKS(child);
+ remove_parent(child);
child->parent = new_parent;
- SET_LINKS(child);
+ add_parent(child);
}
/*
@@ -77,9 +77,9 @@ void __ptrace_unlink(task_t *child)
child->ptrace = 0;
if (!list_empty(&child->ptrace_list)) {
list_del_init(&child->ptrace_list);
- REMOVE_LINKS(child);
+ remove_parent(child);
child->parent = child->real_parent;
- SET_LINKS(child);
+ add_parent(child);
}
ptrace_untrace(child);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 8cf15a569fcd..13458bbaa1be 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -47,15 +47,16 @@
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
+#include <linux/mutex.h>
/* Definition for rcupdate control block. */
-struct rcu_ctrlblk rcu_ctrlblk = {
+static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = SPIN_LOCK_UNLOCKED,
.cpumask = CPU_MASK_NONE,
};
-struct rcu_ctrlblk rcu_bh_ctrlblk = {
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = SPIN_LOCK_UNLOCKED,
@@ -75,7 +76,7 @@ static int rsinterval = 1000;
#endif
static atomic_t rcu_barrier_cpu_count;
-static struct semaphore rcu_barrier_sema;
+static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
#ifdef CONFIG_SMP
@@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused)
void rcu_barrier(void)
{
BUG_ON(in_interrupt());
- /* Take cpucontrol semaphore to protect against CPU hotplug */
- down(&rcu_barrier_sema);
+ /* Take cpucontrol mutex to protect against CPU hotplug */
+ mutex_lock(&rcu_barrier_mutex);
init_completion(&rcu_barrier_completion);
atomic_set(&rcu_barrier_cpu_count, 0);
on_each_cpu(rcu_barrier_func, NULL, 0, 1);
wait_for_completion(&rcu_barrier_completion);
- up(&rcu_barrier_sema);
+ mutex_unlock(&rcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
@@ -415,8 +416,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
rdp->curtail = &rdp->curlist;
}
- local_irq_disable();
if (rdp->nxtlist && !rdp->curlist) {
+ local_irq_disable();
rdp->curlist = rdp->nxtlist;
rdp->curtail = rdp->nxttail;
rdp->nxtlist = NULL;
@@ -441,9 +442,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
rcu_start_batch(rcp);
spin_unlock(&rcp->lock);
}
- } else {
- local_irq_enable();
}
+
rcu_check_quiescent_state(rcp, rdp);
if (rdp->donelist)
rcu_do_batch(rdp);
@@ -549,7 +549,6 @@ static struct notifier_block __devinitdata rcu_nb = {
*/
void __init rcu_init(void)
{
- sema_init(&rcu_barrier_sema, 1);
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
@@ -609,7 +608,7 @@ module_param(qlowmark, int, 0);
module_param(rsinterval, int, 0);
#endif
EXPORT_SYMBOL_GPL(rcu_batches_completed);
-EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */
-EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */
EXPORT_SYMBOL_GPL(synchronize_rcu);
-EXPORT_SYMBOL(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7712912dbc84..8154e7589d12 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -54,15 +54,15 @@ static int verbose; /* Print more debug info. */
static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
-MODULE_PARM(nreaders, "i");
+module_param(nreaders, int, 0);
MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-MODULE_PARM(stat_interval, "i");
+module_param(stat_interval, int, 0);
MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
-MODULE_PARM(verbose, "i");
+module_param(verbose, bool, 0);
MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-MODULE_PARM(test_no_idle_hz, "i");
+module_param(test_no_idle_hz, bool, 0);
MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
-MODULE_PARM(shuffle_interval, "i");
+module_param(shuffle_interval, int, 0);
MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
#define TORTURE_FLAG "rcutorture: "
#define PRINTK_STRING(s) \
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
- for_each_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -441,6 +441,16 @@ rcu_torture_shuffle(void *arg)
return 0;
}
+static inline void
+rcu_torture_print_module_parms(char *tag)
+{
+ printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
+ "stat_interval=%d verbose=%d test_no_idle_hz=%d "
+ "shuffle_interval = %d\n",
+ tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
+ shuffle_interval);
+}
+
static void
rcu_torture_cleanup(void)
{
@@ -483,9 +493,10 @@ rcu_torture_cleanup(void)
rcu_barrier();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
- printk(KERN_ALERT TORTURE_FLAG
- "--- End of test: %s\n",
- atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE");
+ if (atomic_read(&n_rcu_torture_error))
+ rcu_torture_print_module_parms("End of test: FAILURE");
+ else
+ rcu_torture_print_module_parms("End of test: SUCCESS");
}
static int
@@ -501,11 +512,7 @@ rcu_torture_init(void)
nrealreaders = nreaders;
else
nrealreaders = 2 * num_online_cpus();
- printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d "
- "stat_interval=%d verbose=%d test_no_idle_hz=%d "
- "shuffle_interval = %d\n",
- nrealreaders, stat_interval, verbose, test_no_idle_hz,
- shuffle_interval);
+ rcu_torture_print_module_parms("Start of test");
fullstop = 0;
/* Set up the freelist. */
@@ -528,7 +535,7 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_error, 0);
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
atomic_set(&rcu_torture_wcount[i], 0);
- for_each_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
per_cpu(rcu_torture_count, cpu)[i] = 0;
per_cpu(rcu_torture_batch, cpu)[i] = 0;
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 000000000000..33345e73485c
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,1012 @@
+/*
+ * Public API and common code for kernel->userspace relay file support.
+ *
+ * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * Moved to kernel/relay.c by Paul Mundt, 2006.
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/relay.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+/*
+ * close() vm_op implementation for relay file mapping.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+ struct rchan_buf *buf = vma->vm_private_data;
+ buf->chan->cb->buf_unmapped(buf, vma->vm_file);
+}
+
+/*
+ * nopage() vm_op implementation for relay file mapping.
+ */
+static struct page *relay_buf_nopage(struct vm_area_struct *vma,
+ unsigned long address,
+ int *type)
+{
+ struct page *page;
+ struct rchan_buf *buf = vma->vm_private_data;
+ unsigned long offset = address - vma->vm_start;
+
+ if (address > vma->vm_end)
+ return NOPAGE_SIGBUS; /* Disallow mremap */
+ if (!buf)
+ return NOPAGE_OOM;
+
+ page = vmalloc_to_page(buf->start + offset);
+ if (!page)
+ return NOPAGE_OOM;
+ get_page(page);
+
+ if (type)
+ *type = VM_FAULT_MINOR;
+
+ return page;
+}
+
+/*
+ * vm_ops for relay file mappings.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+ .nopage = relay_buf_nopage,
+ .close = relay_file_mmap_close,
+};
+
+/**
+ * relay_mmap_buf: - mmap channel buffer to process address space
+ * @buf: relay channel buffer
+ * @vma: vm_area_struct describing memory to be mapped
+ *
+ * Returns 0 if ok, negative on error
+ *
+ * Caller should already have grabbed mmap_sem.
+ */
+int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+ unsigned long length = vma->vm_end - vma->vm_start;
+ struct file *filp = vma->vm_file;
+
+ if (!buf)
+ return -EBADF;
+
+ if (length != (unsigned long)buf->chan->alloc_size)
+ return -EINVAL;
+
+ vma->vm_ops = &relay_file_mmap_ops;
+ vma->vm_private_data = buf;
+ buf->chan->cb->buf_mapped(buf, filp);
+
+ return 0;
+}
+
+/**
+ * relay_alloc_buf - allocate a channel buffer
+ * @buf: the buffer struct
+ * @size: total size of the buffer
+ *
+ * Returns a pointer to the resulting buffer, NULL if unsuccessful. The
+ * passed in size will get page aligned, if it isn't already.
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
+{
+ void *mem;
+ unsigned int i, j, n_pages;
+
+ *size = PAGE_ALIGN(*size);
+ n_pages = *size >> PAGE_SHIFT;
+
+ buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!buf->page_array)
+ return NULL;
+
+ for (i = 0; i < n_pages; i++) {
+ buf->page_array[i] = alloc_page(GFP_KERNEL);
+ if (unlikely(!buf->page_array[i]))
+ goto depopulate;
+ }
+ mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
+ if (!mem)
+ goto depopulate;
+
+ memset(mem, 0, *size);
+ buf->page_count = n_pages;
+ return mem;
+
+depopulate:
+ for (j = 0; j < i; j++)
+ __free_page(buf->page_array[j]);
+ kfree(buf->page_array);
+ return NULL;
+}
+
+/**
+ * relay_create_buf - allocate and initialize a channel buffer
+ * @alloc_size: size of the buffer to allocate
+ * @n_subbufs: number of sub-buffers in the channel
+ *
+ * Returns channel buffer if successful, NULL otherwise
+ */
+struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+ struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+ if (!buf->padding)
+ goto free_buf;
+
+ buf->start = relay_alloc_buf(buf, &chan->alloc_size);
+ if (!buf->start)
+ goto free_buf;
+
+ buf->chan = chan;
+ kref_get(&buf->chan->kref);
+ return buf;
+
+free_buf:
+ kfree(buf->padding);
+ kfree(buf);
+ return NULL;
+}
+
+/**
+ * relay_destroy_channel - free the channel struct
+ *
+ * Should only be called from kref_put().
+ */
+void relay_destroy_channel(struct kref *kref)
+{
+ struct rchan *chan = container_of(kref, struct rchan, kref);
+ kfree(chan);
+}
+
+/**
+ * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ * @buf: the buffer struct
+ */
+void relay_destroy_buf(struct rchan_buf *buf)
+{
+ struct rchan *chan = buf->chan;
+ unsigned int i;
+
+ if (likely(buf->start)) {
+ vunmap(buf->start);
+ for (i = 0; i < buf->page_count; i++)
+ __free_page(buf->page_array[i]);
+ kfree(buf->page_array);
+ }
+ kfree(buf->padding);
+ kfree(buf);
+ kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ * relay_remove_buf - remove a channel buffer
+ *
+ * Removes the file from the fileystem, which also frees the
+ * rchan_buf_struct and the channel buffer. Should only be called from
+ * kref_put().
+ */
+void relay_remove_buf(struct kref *kref)
+{
+ struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+ buf->chan->cb->remove_buf_file(buf->dentry);
+ relay_destroy_buf(buf);
+}
+
+/**
+ * relay_buf_empty - boolean, is the channel buffer empty?
+ * @buf: channel buffer
+ *
+ * Returns 1 if the buffer is empty, 0 otherwise.
+ */
+int relay_buf_empty(struct rchan_buf *buf)
+{
+ return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+EXPORT_SYMBOL_GPL(relay_buf_empty);
+
+/**
+ * relay_buf_full - boolean, is the channel buffer full?
+ * @buf: channel buffer
+ *
+ * Returns 1 if the buffer is full, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+ size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+ return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(relay_buf_full);
+
+/*
+ * High-level relay kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior. Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback. Does nothing.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+ void *subbuf,
+ void *prev_subbuf,
+ size_t prev_padding)
+{
+ if (relay_buf_full(buf))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * buf_mapped() default callback. Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+ struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback. Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+ struct file *filp)
+{
+}
+
+/*
+ * create_buf_file_create() default callback. Does nothing.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+ struct dentry *parent,
+ int mode,
+ struct rchan_buf *buf,
+ int *is_global)
+{
+ return NULL;
+}
+
+/*
+ * remove_buf_file() default callback. Does nothing.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+ return -EINVAL;
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+ .subbuf_start = subbuf_start_default_callback,
+ .buf_mapped = buf_mapped_default_callback,
+ .buf_unmapped = buf_unmapped_default_callback,
+ .create_buf_file = create_buf_file_default_callback,
+ .remove_buf_file = remove_buf_file_default_callback,
+};
+
+/**
+ * wakeup_readers - wake up readers waiting on a channel
+ * @private: the channel buffer
+ *
+ * This is the work function used to defer reader waking. The
+ * reason waking is deferred is that calling directly from write
+ * causes problems if you're writing from say the scheduler.
+ */
+static void wakeup_readers(void *private)
+{
+ struct rchan_buf *buf = private;
+ wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ * __relay_reset - reset a channel buffer
+ * @buf: the channel buffer
+ * @init: 1 if this is a first-time initialization
+ *
+ * See relay_reset for description of effect.
+ */
+static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+ size_t i;
+
+ if (init) {
+ init_waitqueue_head(&buf->read_wait);
+ kref_init(&buf->kref);
+ INIT_WORK(&buf->wake_readers, NULL, NULL);
+ } else {
+ cancel_delayed_work(&buf->wake_readers);
+ flush_scheduled_work();
+ }
+
+ buf->subbufs_produced = 0;
+ buf->subbufs_consumed = 0;
+ buf->bytes_consumed = 0;
+ buf->finalized = 0;
+ buf->data = buf->start;
+ buf->offset = 0;
+
+ for (i = 0; i < buf->chan->n_subbufs; i++)
+ buf->padding[i] = 0;
+
+ buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+
+/**
+ * relay_reset - reset the channel
+ * @chan: the channel
+ *
+ * This has the effect of erasing all data from all channel buffers
+ * and restarting the channel in its initial state. The buffers
+ * are not freed, so any mappings are still in effect.
+ *
+ * NOTE: Care should be taken that the channel isn't actually
+ * being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+ unsigned int i;
+ struct rchan_buf *prev = NULL;
+
+ if (!chan)
+ return;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!chan->buf[i] || chan->buf[i] == prev)
+ break;
+ __relay_reset(chan->buf[i], 0);
+ prev = chan->buf[i];
+ }
+}
+EXPORT_SYMBOL_GPL(relay_reset);
+
+/**
+ * relay_open_buf - create a new relay channel buffer
+ *
+ * Internal - used by relay_open().
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan,
+ const char *filename,
+ struct dentry *parent,
+ int *is_global)
+{
+ struct rchan_buf *buf;
+ struct dentry *dentry;
+
+ if (*is_global)
+ return chan->buf[0];
+
+ buf = relay_create_buf(chan);
+ if (!buf)
+ return NULL;
+
+ /* Create file in fs */
+ dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
+ buf, is_global);
+ if (!dentry) {
+ relay_destroy_buf(buf);
+ return NULL;
+ }
+
+ buf->dentry = dentry;
+ __relay_reset(buf, 1);
+
+ return buf;
+}
+
+/**
+ * relay_close_buf - close a channel buffer
+ * @buf: channel buffer
+ *
+ * Marks the buffer finalized and restores the default callbacks.
+ * The channel buffer and channel buffer data structure are then freed
+ * automatically when the last reference is given up.
+ */
+static inline void relay_close_buf(struct rchan_buf *buf)
+{
+ buf->finalized = 1;
+ cancel_delayed_work(&buf->wake_readers);
+ flush_scheduled_work();
+ kref_put(&buf->kref, relay_remove_buf);
+}
+
+static inline void setup_callbacks(struct rchan *chan,
+ struct rchan_callbacks *cb)
+{
+ if (!cb) {
+ chan->cb = &default_channel_callbacks;
+ return;
+ }
+
+ if (!cb->subbuf_start)
+ cb->subbuf_start = subbuf_start_default_callback;
+ if (!cb->buf_mapped)
+ cb->buf_mapped = buf_mapped_default_callback;
+ if (!cb->buf_unmapped)
+ cb->buf_unmapped = buf_unmapped_default_callback;
+ if (!cb->create_buf_file)
+ cb->create_buf_file = create_buf_file_default_callback;
+ if (!cb->remove_buf_file)
+ cb->remove_buf_file = remove_buf_file_default_callback;
+ chan->cb = cb;
+}
+
+/**
+ * relay_open - create a new relay channel
+ * @base_filename: base name of files to create
+ * @parent: dentry of parent directory, NULL for root directory
+ * @subbuf_size: size of sub-buffers
+ * @n_subbufs: number of sub-buffers
+ * @cb: client callback functions
+ *
+ * Returns channel pointer if successful, NULL otherwise.
+ *
+ * Creates a channel buffer for each cpu using the sizes and
+ * attributes specified. The created channel buffer files
+ * will be named base_filename0...base_filenameN-1. File
+ * permissions will be S_IRUSR.
+ */
+struct rchan *relay_open(const char *base_filename,
+ struct dentry *parent,
+ size_t subbuf_size,
+ size_t n_subbufs,
+ struct rchan_callbacks *cb)
+{
+ unsigned int i;
+ struct rchan *chan;
+ char *tmpname;
+ int is_global = 0;
+
+ if (!base_filename)
+ return NULL;
+
+ if (!(subbuf_size && n_subbufs))
+ return NULL;
+
+ chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
+ if (!chan)
+ return NULL;
+
+ chan->version = RELAYFS_CHANNEL_VERSION;
+ chan->n_subbufs = n_subbufs;
+ chan->subbuf_size = subbuf_size;
+ chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+ setup_callbacks(chan, cb);
+ kref_init(&chan->kref);
+
+ tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!tmpname)
+ goto free_chan;
+
+ for_each_online_cpu(i) {
+ sprintf(tmpname, "%s%d", base_filename, i);
+ chan->buf[i] = relay_open_buf(chan, tmpname, parent,
+ &is_global);
+ if (!chan->buf[i])
+ goto free_bufs;
+
+ chan->buf[i]->cpu = i;
+ }
+
+ kfree(tmpname);
+ return chan;
+
+free_bufs:
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!chan->buf[i])
+ break;
+ relay_close_buf(chan->buf[i]);
+ if (is_global)
+ break;
+ }
+ kfree(tmpname);
+
+free_chan:
+ kref_put(&chan->kref, relay_destroy_channel);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(relay_open);
+
+/**
+ * relay_switch_subbuf - switch to a new sub-buffer
+ * @buf: channel buffer
+ * @length: size of current event
+ *
+ * Returns either the length passed in or 0 if full.
+ *
+ * Performs sub-buffer-switch tasks such as invoking callbacks,
+ * updating padding counts, waking up readers, etc.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+ void *old, *new;
+ size_t old_subbuf, new_subbuf;
+
+ if (unlikely(length > buf->chan->subbuf_size))
+ goto toobig;
+
+ if (buf->offset != buf->chan->subbuf_size + 1) {
+ buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+ old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+ buf->padding[old_subbuf] = buf->prev_padding;
+ buf->subbufs_produced++;
+ buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
+ buf->padding[old_subbuf];
+ smp_mb();
+ if (waitqueue_active(&buf->read_wait)) {
+ PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+ schedule_delayed_work(&buf->wake_readers, 1);
+ }
+ }
+
+ old = buf->data;
+ new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+ new = buf->start + new_subbuf * buf->chan->subbuf_size;
+ buf->offset = 0;
+ if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+ buf->offset = buf->chan->subbuf_size + 1;
+ return 0;
+ }
+ buf->data = new;
+ buf->padding[new_subbuf] = 0;
+
+ if (unlikely(length + buf->offset > buf->chan->subbuf_size))
+ goto toobig;
+
+ return length;
+
+toobig:
+ buf->chan->last_toobig = length;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+
+/**
+ * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
+ * @chan: the channel
+ * @cpu: the cpu associated with the channel buffer to update
+ * @subbufs_consumed: number of sub-buffers to add to current buf's count
+ *
+ * Adds to the channel buffer's consumed sub-buffer count.
+ * subbufs_consumed should be the number of sub-buffers newly consumed,
+ * not the total consumed.
+ *
+ * NOTE: kernel clients don't need to call this function if the channel
+ * mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+ unsigned int cpu,
+ size_t subbufs_consumed)
+{
+ struct rchan_buf *buf;
+
+ if (!chan)
+ return;
+
+ if (cpu >= NR_CPUS || !chan->buf[cpu])
+ return;
+
+ buf = chan->buf[cpu];
+ buf->subbufs_consumed += subbufs_consumed;
+ if (buf->subbufs_consumed > buf->subbufs_produced)
+ buf->subbufs_consumed = buf->subbufs_produced;
+}
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+
+/**
+ * relay_close - close the channel
+ * @chan: the channel
+ *
+ * Closes all channel buffers and frees the channel.
+ */
+void relay_close(struct rchan *chan)
+{
+ unsigned int i;
+ struct rchan_buf *prev = NULL;
+
+ if (!chan)
+ return;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!chan->buf[i] || chan->buf[i] == prev)
+ break;
+ relay_close_buf(chan->buf[i]);
+ prev = chan->buf[i];
+ }
+
+ if (chan->last_toobig)
+ printk(KERN_WARNING "relay: one or more items not logged "
+ "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+ chan->last_toobig, chan->subbuf_size);
+
+ kref_put(&chan->kref, relay_destroy_channel);
+}
+EXPORT_SYMBOL_GPL(relay_close);
+
+/**
+ * relay_flush - close the channel
+ * @chan: the channel
+ *
+ * Flushes all channel buffers i.e. forces buffer switch.
+ */
+void relay_flush(struct rchan *chan)
+{
+ unsigned int i;
+ struct rchan_buf *prev = NULL;
+
+ if (!chan)
+ return;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!chan->buf[i] || chan->buf[i] == prev)
+ break;
+ relay_switch_subbuf(chan->buf[i], 0);
+ prev = chan->buf[i];
+ }
+}
+EXPORT_SYMBOL_GPL(relay_flush);
+
+/**
+ * relay_file_open - open file op for relay files
+ * @inode: the inode
+ * @filp: the file
+ *
+ * Increments the channel buffer refcount.
+ */
+static int relay_file_open(struct inode *inode, struct file *filp)
+{
+ struct rchan_buf *buf = inode->u.generic_ip;
+ kref_get(&buf->kref);
+ filp->private_data = buf;
+
+ return 0;
+}
+
+/**
+ * relay_file_mmap - mmap file op for relay files
+ * @filp: the file
+ * @vma: the vma describing what to map
+ *
+ * Calls upon relay_mmap_buf to map the file into user space.
+ */
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct rchan_buf *buf = filp->private_data;
+ return relay_mmap_buf(buf, vma);
+}
+
+/**
+ * relay_file_poll - poll file op for relay files
+ * @filp: the file
+ * @wait: poll table
+ *
+ * Poll implemention.
+ */
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+{
+ unsigned int mask = 0;
+ struct rchan_buf *buf = filp->private_data;
+
+ if (buf->finalized)
+ return POLLERR;
+
+ if (filp->f_mode & FMODE_READ) {
+ poll_wait(filp, &buf->read_wait, wait);
+ if (!relay_buf_empty(buf))
+ mask |= POLLIN | POLLRDNORM;
+ }
+
+ return mask;
+}
+
+/**
+ * relay_file_release - release file op for relay files
+ * @inode: the inode
+ * @filp: the file
+ *
+ * Decrements the channel refcount, as the filesystem is
+ * no longer using it.
+ */
+static int relay_file_release(struct inode *inode, struct file *filp)
+{
+ struct rchan_buf *buf = filp->private_data;
+ kref_put(&buf->kref, relay_remove_buf);
+
+ return 0;
+}
+
+/**
+ * relay_file_read_consume - update the consumed count for the buffer
+ */
+static void relay_file_read_consume(struct rchan_buf *buf,
+ size_t read_pos,
+ size_t bytes_consumed)
+{
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+ size_t read_subbuf;
+
+ if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+ relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+ buf->bytes_consumed = 0;
+ }
+
+ buf->bytes_consumed += bytes_consumed;
+ read_subbuf = read_pos / buf->chan->subbuf_size;
+ if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+ if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+ (buf->offset == subbuf_size))
+ return;
+ relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+ buf->bytes_consumed = 0;
+ }
+}
+
+/**
+ * relay_file_read_avail - boolean, are there unconsumed bytes available?
+ */
+static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+ size_t produced = buf->subbufs_produced;
+ size_t consumed = buf->subbufs_consumed;
+
+ relay_file_read_consume(buf, read_pos, 0);
+
+ if (unlikely(buf->offset > subbuf_size)) {
+ if (produced == consumed)
+ return 0;
+ return 1;
+ }
+
+ if (unlikely(produced - consumed >= n_subbufs)) {
+ consumed = (produced / n_subbufs) * n_subbufs;
+ buf->subbufs_consumed = consumed;
+ }
+
+ produced = (produced % n_subbufs) * subbuf_size + buf->offset;
+ consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
+
+ if (consumed > produced)
+ produced += n_subbufs * subbuf_size;
+
+ if (consumed == produced)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ */
+static size_t relay_file_read_subbuf_avail(size_t read_pos,
+ struct rchan_buf *buf)
+{
+ size_t padding, avail = 0;
+ size_t read_subbuf, read_offset, write_subbuf, write_offset;
+ size_t subbuf_size = buf->chan->subbuf_size;
+
+ write_subbuf = (buf->data - buf->start) / subbuf_size;
+ write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+ read_subbuf = read_pos / subbuf_size;
+ read_offset = read_pos % subbuf_size;
+ padding = buf->padding[read_subbuf];
+
+ if (read_subbuf == write_subbuf) {
+ if (read_offset + padding < write_offset)
+ avail = write_offset - (read_offset + padding);
+ } else
+ avail = (subbuf_size - padding) - read_offset;
+
+ return avail;
+}
+
+/**
+ * relay_file_read_start_pos - find the first available byte to read
+ *
+ * If the read_pos is in the middle of padding, return the
+ * position of the first actually available byte, otherwise
+ * return the original value.
+ */
+static size_t relay_file_read_start_pos(size_t read_pos,
+ struct rchan_buf *buf)
+{
+ size_t read_subbuf, padding, padding_start, padding_end;
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+
+ read_subbuf = read_pos / subbuf_size;
+ padding = buf->padding[read_subbuf];
+ padding_start = (read_subbuf + 1) * subbuf_size - padding;
+ padding_end = (read_subbuf + 1) * subbuf_size;
+ if (read_pos >= padding_start && read_pos < padding_end) {
+ read_subbuf = (read_subbuf + 1) % n_subbufs;
+ read_pos = read_subbuf * subbuf_size;
+ }
+
+ return read_pos;
+}
+
+/**
+ * relay_file_read_end_pos - return the new read position
+ */
+static size_t relay_file_read_end_pos(struct rchan_buf *buf,
+ size_t read_pos,
+ size_t count)
+{
+ size_t read_subbuf, padding, end_pos;
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+
+ read_subbuf = read_pos / subbuf_size;
+ padding = buf->padding[read_subbuf];
+ if (read_pos % subbuf_size + count + padding == subbuf_size)
+ end_pos = (read_subbuf + 1) * subbuf_size;
+ else
+ end_pos = read_pos + count;
+ if (end_pos >= subbuf_size * n_subbufs)
+ end_pos = 0;
+
+ return end_pos;
+}
+
+/**
+ * subbuf_read_actor - read up to one subbuf's worth of data
+ */
+static int subbuf_read_actor(size_t read_start,
+ struct rchan_buf *buf,
+ size_t avail,
+ read_descriptor_t *desc,
+ read_actor_t actor)
+{
+ void *from;
+ int ret = 0;
+
+ from = buf->start + read_start;
+ ret = avail;
+ if (copy_to_user(desc->arg.data, from, avail)) {
+ desc->error = -EFAULT;
+ ret = 0;
+ }
+ desc->arg.data += ret;
+ desc->written += ret;
+ desc->count -= ret;
+
+ return ret;
+}
+
+/**
+ * subbuf_send_actor - send up to one subbuf's worth of data
+ */
+static int subbuf_send_actor(size_t read_start,
+ struct rchan_buf *buf,
+ size_t avail,
+ read_descriptor_t *desc,
+ read_actor_t actor)
+{
+ unsigned long pidx, poff;
+ unsigned int subbuf_pages;
+ int ret = 0;
+
+ subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
+ pidx = (read_start / PAGE_SIZE) % subbuf_pages;
+ poff = read_start & ~PAGE_MASK;
+ while (avail) {
+ struct page *p = buf->page_array[pidx];
+ unsigned int len;
+
+ len = PAGE_SIZE - poff;
+ if (len > avail)
+ len = avail;
+
+ len = actor(desc, p, poff, len);
+ if (desc->error)
+ break;
+
+ avail -= len;
+ ret += len;
+ poff = 0;
+ pidx = (pidx + 1) % subbuf_pages;
+ }
+
+ return ret;
+}
+
+typedef int (*subbuf_actor_t) (size_t read_start,
+ struct rchan_buf *buf,
+ size_t avail,
+ read_descriptor_t *desc,
+ read_actor_t actor);
+
+/**
+ * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
+ */
+static inline ssize_t relay_file_read_subbufs(struct file *filp,
+ loff_t *ppos,
+ size_t count,
+ subbuf_actor_t subbuf_actor,
+ read_actor_t actor,
+ void *target)
+{
+ struct rchan_buf *buf = filp->private_data;
+ size_t read_start, avail;
+ read_descriptor_t desc;
+ int ret;
+
+ if (!count)
+ return 0;
+
+ desc.written = 0;
+ desc.count = count;
+ desc.arg.data = target;
+ desc.error = 0;
+
+ mutex_lock(&filp->f_dentry->d_inode->i_mutex);
+ do {
+ if (!relay_file_read_avail(buf, *ppos))
+ break;
+
+ read_start = relay_file_read_start_pos(*ppos, buf);
+ avail = relay_file_read_subbuf_avail(read_start, buf);
+ if (!avail)
+ break;
+
+ avail = min(desc.count, avail);
+ ret = subbuf_actor(read_start, buf, avail, &desc, actor);
+ if (desc.error < 0)
+ break;
+
+ if (ret) {
+ relay_file_read_consume(buf, read_start, ret);
+ *ppos = relay_file_read_end_pos(buf, read_start, ret);
+ }
+ } while (desc.count && ret);
+ mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
+
+ return desc.written;
+}
+
+static ssize_t relay_file_read(struct file *filp,
+ char __user *buffer,
+ size_t count,
+ loff_t *ppos)
+{
+ return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
+ NULL, buffer);
+}
+
+static ssize_t relay_file_sendfile(struct file *filp,
+ loff_t *ppos,
+ size_t count,
+ read_actor_t actor,
+ void *target)
+{
+ return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
+ actor, target);
+}
+
+struct file_operations relay_file_operations = {
+ .open = relay_file_open,
+ .poll = relay_file_poll,
+ .mmap = relay_file_mmap,
+ .read = relay_file_read,
+ .llseek = no_llseek,
+ .release = relay_file_release,
+ .sendfile = relay_file_sendfile,
+};
+EXPORT_SYMBOL_GPL(relay_file_operations);
diff --git a/kernel/sched.c b/kernel/sched.c
index 4d46e90f59c3..a9ecac398bb9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,6 +49,7 @@
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/acct.h>
+#include <linux/kprobes.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
@@ -144,7 +145,8 @@
(v1) * (v2_max) / (v1_max)
#define DELTA(p) \
- (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
+ (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
+ INTERACTIVE_DELTA)
#define TASK_INTERACTIVE(p) \
((p)->prio <= (p)->static_prio - DELTA(p))
@@ -237,6 +239,7 @@ struct runqueue {
task_t *migration_thread;
struct list_head migration_queue;
+ int cpu;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -707,12 +710,6 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
DEF_TIMESLICE);
} else {
/*
- * The lower the sleep avg a task has the more
- * rapidly it will rise with sleep time.
- */
- sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
-
- /*
* Tasks waking from uninterruptible sleep are
* limited in their sleep_avg rise as they
* are likely to be waiting on I/O
@@ -1551,8 +1548,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
finish_lock_switch(rq, prev);
if (mm)
mmdrop(mm);
- if (unlikely(prev_task_flags & PF_DEAD))
+ if (unlikely(prev_task_flags & PF_DEAD)) {
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
put_task_struct(prev);
+ }
}
/**
@@ -1622,7 +1625,7 @@ unsigned long nr_uninterruptible(void)
{
unsigned long i, sum = 0;
- for_each_cpu(i)
+ for_each_possible_cpu(i)
sum += cpu_rq(i)->nr_uninterruptible;
/*
@@ -1639,7 +1642,7 @@ unsigned long long nr_context_switches(void)
{
unsigned long long i, sum = 0;
- for_each_cpu(i)
+ for_each_possible_cpu(i)
sum += cpu_rq(i)->nr_switches;
return sum;
@@ -1649,7 +1652,7 @@ unsigned long nr_iowait(void)
{
unsigned long i, sum = 0;
- for_each_cpu(i)
+ for_each_possible_cpu(i)
sum += atomic_read(&cpu_rq(i)->nr_iowait);
return sum;
@@ -1660,6 +1663,9 @@ unsigned long nr_iowait(void)
/*
* double_rq_lock - safely lock two runqueues
*
+ * We must take them in cpu order to match code in
+ * dependent_sleeper and wake_dependent_sleeper.
+ *
* Note this does not disable interrupts like task_rq_lock,
* you need to do so manually before calling.
*/
@@ -1671,7 +1677,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
spin_lock(&rq1->lock);
__acquire(rq2->lock); /* Fake it out ;) */
} else {
- if (rq1 < rq2) {
+ if (rq1->cpu < rq2->cpu) {
spin_lock(&rq1->lock);
spin_lock(&rq2->lock);
} else {
@@ -1707,7 +1713,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
__acquires(this_rq->lock)
{
if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
+ if (busiest->cpu < this_rq->cpu) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
spin_lock(&this_rq->lock);
@@ -2873,13 +2879,11 @@ asmlinkage void __sched schedule(void)
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
- if (likely(!current->exit_state)) {
- if (unlikely(in_atomic())) {
- printk(KERN_ERR "scheduling while atomic: "
- "%s/0x%08x/%d\n",
- current->comm, preempt_count(), current->pid);
- dump_stack();
- }
+ if (unlikely(in_atomic() && !current->exit_state)) {
+ printk(KERN_ERR "BUG: scheduling while atomic: "
+ "%s/0x%08x/%d\n",
+ current->comm, preempt_count(), current->pid);
+ dump_stack();
}
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -5570,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
}
#endif
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+ return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+ return cpu;
+}
+#endif
+
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
static int cpu_to_phys_group(int cpu)
{
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+ cpumask_t mask = cpu_coregroup_map(cpu);
+ return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
return first_cpu(cpu_sibling_map[cpu]);
#else
return cpu;
@@ -5597,6 +5621,32 @@ static int cpu_to_allnodes_group(int cpu)
{
return cpu_to_node(cpu);
}
+static void init_numa_sched_groups_power(struct sched_group *group_head)
+{
+ struct sched_group *sg = group_head;
+ int j;
+
+ if (!sg)
+ return;
+next_sg:
+ for_each_cpu_mask(j, sg->cpumask) {
+ struct sched_domain *sd;
+
+ sd = &per_cpu(phys_domains, j);
+ if (j != first_cpu(sd->groups->cpumask)) {
+ /*
+ * Only add "power" once for each
+ * physical package.
+ */
+ continue;
+ }
+
+ sg->cpu_power += sd->groups->cpu_power;
+ }
+ sg = sg->next;
+ if (sg != group_head)
+ goto next_sg;
+}
#endif
/*
@@ -5672,6 +5722,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
sd->parent = p;
sd->groups = &sched_group_phys[group];
+#ifdef CONFIG_SCHED_MC
+ p = sd;
+ sd = &per_cpu(core_domains, i);
+ group = cpu_to_core_group(i);
+ *sd = SD_MC_INIT;
+ sd->span = cpu_coregroup_map(i);
+ cpus_and(sd->span, sd->span, *cpu_map);
+ sd->parent = p;
+ sd->groups = &sched_group_core[group];
+#endif
+
#ifdef CONFIG_SCHED_SMT
p = sd;
sd = &per_cpu(cpu_domains, i);
@@ -5697,6 +5758,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
}
#endif
+#ifdef CONFIG_SCHED_MC
+ /* Set up multi-core groups */
+ for_each_cpu_mask(i, *cpu_map) {
+ cpumask_t this_core_map = cpu_coregroup_map(i);
+ cpus_and(this_core_map, this_core_map, *cpu_map);
+ if (i != first_cpu(this_core_map))
+ continue;
+ init_sched_build_groups(sched_group_core, this_core_map,
+ &cpu_to_core_group);
+ }
+#endif
+
+
/* Set up physical groups */
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
@@ -5793,51 +5867,38 @@ void build_sched_domains(const cpumask_t *cpu_map)
power = SCHED_LOAD_SCALE;
sd->groups->cpu_power = power;
#endif
+#ifdef CONFIG_SCHED_MC
+ sd = &per_cpu(core_domains, i);
+ power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+ * SCHED_LOAD_SCALE / 10;
+ sd->groups->cpu_power = power;
+
+ sd = &per_cpu(phys_domains, i);
+ /*
+ * This has to be < 2 * SCHED_LOAD_SCALE
+ * Lets keep it SCHED_LOAD_SCALE, so that
+ * while calculating NUMA group's cpu_power
+ * we can simply do
+ * numa_group->cpu_power += phys_group->cpu_power;
+ *
+ * See "only add power once for each physical pkg"
+ * comment below
+ */
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
sd = &per_cpu(phys_domains, i);
power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
(cpus_weight(sd->groups->cpumask)-1) / 10;
sd->groups->cpu_power = power;
-
-#ifdef CONFIG_NUMA
- sd = &per_cpu(allnodes_domains, i);
- if (sd->groups) {
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
- sd->groups->cpu_power = power;
- }
#endif
}
#ifdef CONFIG_NUMA
- for (i = 0; i < MAX_NUMNODES; i++) {
- struct sched_group *sg = sched_group_nodes[i];
- int j;
-
- if (sg == NULL)
- continue;
-next_sg:
- for_each_cpu_mask(j, sg->cpumask) {
- struct sched_domain *sd;
- int power;
+ for (i = 0; i < MAX_NUMNODES; i++)
+ init_numa_sched_groups_power(sched_group_nodes[i]);
- sd = &per_cpu(phys_domains, j);
- if (j != first_cpu(sd->groups->cpumask)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
-
- sg->cpu_power += power;
- }
- sg = sg->next;
- if (sg != sched_group_nodes[i])
- goto next_sg;
- }
+ init_numa_sched_groups_power(sched_group_allnodes);
#endif
/* Attach the domains */
@@ -5845,6 +5906,8 @@ next_sg:
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+ sd = &per_cpu(core_domains, i);
#else
sd = &per_cpu(phys_domains, i);
#endif
@@ -6017,7 +6080,7 @@ void __init sched_init(void)
runqueue_t *rq;
int i, j, k;
- for_each_cpu(i) {
+ for_each_possible_cpu(i) {
prio_array_t *array;
rq = cpu_rq(i);
@@ -6035,6 +6098,7 @@ void __init sched_init(void)
rq->push_cpu = 0;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
+ rq->cpu = i;
#endif
atomic_set(&rq->nr_iowait, 0);
@@ -6075,7 +6139,7 @@ void __might_sleep(char *file, int line)
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
- printk(KERN_ERR "Debug: sleeping function called from invalid"
+ printk(KERN_ERR "BUG: sleeping function called from invalid"
" context at %s:%d\n", file, line);
printk("in_atomic():%d, irqs_disabled():%d\n",
in_atomic(), irqs_disabled());
diff --git a/kernel/signal.c b/kernel/signal.c
index ea154104a00b..4922928d91f6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
-#include <linux/posix-timers.h>
#include <linux/signal.h>
#include <linux/audit.h>
#include <linux/capability.h>
@@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep;
#define sig_kernel_stop(sig) \
(((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
+#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
+
#define sig_user_defined(t, signr) \
(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
@@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q)
kmem_cache_free(sigqueue_cachep, q);
}
-static void flush_sigqueue(struct sigpending *queue)
+void flush_sigqueue(struct sigpending *queue)
{
struct sigqueue *q;
@@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue)
/*
* Flush all pending signals for a task.
*/
-
-void
-flush_signals(struct task_struct *t)
+void flush_signals(struct task_struct *t)
{
unsigned long flags;
@@ -321,109 +320,6 @@ flush_signals(struct task_struct *t)
}
/*
- * This function expects the tasklist_lock write-locked.
- */
-void __exit_sighand(struct task_struct *tsk)
-{
- struct sighand_struct * sighand = tsk->sighand;
-
- /* Ok, we're done with the signal handlers */
- tsk->sighand = NULL;
- if (atomic_dec_and_test(&sighand->count))
- sighand_free(sighand);
-}
-
-void exit_sighand(struct task_struct *tsk)
-{
- write_lock_irq(&tasklist_lock);
- rcu_read_lock();
- if (tsk->sighand != NULL) {
- struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
- spin_lock(&sighand->siglock);
- __exit_sighand(tsk);
- spin_unlock(&sighand->siglock);
- }
- rcu_read_unlock();
- write_unlock_irq(&tasklist_lock);
-}
-
-/*
- * This function expects the tasklist_lock write-locked.
- */
-void __exit_signal(struct task_struct *tsk)
-{
- struct signal_struct * sig = tsk->signal;
- struct sighand_struct * sighand;
-
- if (!sig)
- BUG();
- if (!atomic_read(&sig->count))
- BUG();
- rcu_read_lock();
- sighand = rcu_dereference(tsk->sighand);
- spin_lock(&sighand->siglock);
- posix_cpu_timers_exit(tsk);
- if (atomic_dec_and_test(&sig->count)) {
- posix_cpu_timers_exit_group(tsk);
- tsk->signal = NULL;
- __exit_sighand(tsk);
- spin_unlock(&sighand->siglock);
- flush_sigqueue(&sig->shared_pending);
- } else {
- /*
- * If there is any task waiting for the group exit
- * then notify it:
- */
- if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
- wake_up_process(sig->group_exit_task);
- sig->group_exit_task = NULL;
- }
- if (tsk == sig->curr_target)
- sig->curr_target = next_thread(tsk);
- tsk->signal = NULL;
- /*
- * Accumulate here the counters for all threads but the
- * group leader as they die, so they can be added into
- * the process-wide totals when those are taken.
- * The group leader stays around as a zombie as long
- * as there are other threads. When it gets reaped,
- * the exit.c code will add its counts into these totals.
- * We won't ever get here for the group leader, since it
- * will have been the last reference on the signal_struct.
- */
- sig->utime = cputime_add(sig->utime, tsk->utime);
- sig->stime = cputime_add(sig->stime, tsk->stime);
- sig->min_flt += tsk->min_flt;
- sig->maj_flt += tsk->maj_flt;
- sig->nvcsw += tsk->nvcsw;
- sig->nivcsw += tsk->nivcsw;
- sig->sched_time += tsk->sched_time;
- __exit_sighand(tsk);
- spin_unlock(&sighand->siglock);
- sig = NULL; /* Marker for below. */
- }
- rcu_read_unlock();
- clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
- flush_sigqueue(&tsk->pending);
- if (sig) {
- /*
- * We are cleaning up the signal_struct here.
- */
- exit_thread_group_keys(sig);
- kmem_cache_free(signal_cachep, sig);
- }
-}
-
-void exit_signal(struct task_struct *tsk)
-{
- atomic_dec(&tsk->signal->live);
-
- write_lock_irq(&tasklist_lock);
- __exit_signal(tsk);
- write_unlock_irq(&tasklist_lock);
-}
-
-/*
* Flush all handlers for a task.
*/
@@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
}
/* forward decl */
-static void do_notify_parent_cldstop(struct task_struct *tsk,
- int to_self,
- int why);
+static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
/*
* Handle magic process-wide effects of stop/continue signals.
@@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
p->signal->group_stop_count = 0;
p->signal->flags = SIGNAL_STOP_CONTINUED;
spin_unlock(&p->sighand->siglock);
- do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
+ do_notify_parent_cldstop(p, CLD_STOPPED);
spin_lock(&p->sighand->siglock);
}
rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
p->signal->flags = SIGNAL_STOP_CONTINUED;
p->signal->group_exit_code = 0;
spin_unlock(&p->sighand->siglock);
- do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
+ do_notify_parent_cldstop(p, CLD_CONTINUED);
spin_lock(&p->sighand->siglock);
} else {
/*
@@ -1120,27 +1014,37 @@ void zap_other_threads(struct task_struct *p)
/*
* Must be called under rcu_read_lock() or with tasklist_lock read-held.
*/
+struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
+{
+ struct sighand_struct *sighand;
+
+ for (;;) {
+ sighand = rcu_dereference(tsk->sighand);
+ if (unlikely(sighand == NULL))
+ break;
+
+ spin_lock_irqsave(&sighand->siglock, *flags);
+ if (likely(sighand == tsk->sighand))
+ break;
+ spin_unlock_irqrestore(&sighand->siglock, *flags);
+ }
+
+ return sighand;
+}
+
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
unsigned long flags;
- struct sighand_struct *sp;
int ret;
-retry:
ret = check_kill_permission(sig, info, p);
- if (!ret && sig && (sp = rcu_dereference(p->sighand))) {
- spin_lock_irqsave(&sp->siglock, flags);
- if (p->sighand != sp) {
- spin_unlock_irqrestore(&sp->siglock, flags);
- goto retry;
- }
- if ((atomic_read(&sp->count) == 0) ||
- (atomic_read(&p->usage) == 0)) {
- spin_unlock_irqrestore(&sp->siglock, flags);
- return -ESRCH;
+
+ if (!ret && sig) {
+ ret = -ESRCH;
+ if (lock_task_sighand(p, &flags)) {
+ ret = __group_send_sig_info(sig, info, p);
+ unlock_task_sighand(p, &flags);
}
- ret = __group_send_sig_info(sig, info, p);
- spin_unlock_irqrestore(&sp->siglock, flags);
}
return ret;
@@ -1189,7 +1093,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
struct task_struct *p;
rcu_read_lock();
- if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) {
+ if (unlikely(sig_needs_tasklist(sig))) {
read_lock(&tasklist_lock);
acquired_tasklist_lock = 1;
}
@@ -1405,12 +1309,10 @@ void sigqueue_free(struct sigqueue *q)
__sigqueue_free(q);
}
-int
-send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
+int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
{
unsigned long flags;
int ret = 0;
- struct sighand_struct *sh;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
@@ -1424,48 +1326,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
*/
rcu_read_lock();
- if (unlikely(p->flags & PF_EXITING)) {
+ if (!likely(lock_task_sighand(p, &flags))) {
ret = -1;
goto out_err;
}
-retry:
- sh = rcu_dereference(p->sighand);
-
- spin_lock_irqsave(&sh->siglock, flags);
- if (p->sighand != sh) {
- /* We raced with exec() in a multithreaded process... */
- spin_unlock_irqrestore(&sh->siglock, flags);
- goto retry;
- }
-
- /*
- * We do the check here again to handle the following scenario:
- *
- * CPU 0 CPU 1
- * send_sigqueue
- * check PF_EXITING
- * interrupt exit code running
- * __exit_signal
- * lock sighand->siglock
- * unlock sighand->siglock
- * lock sh->siglock
- * add(tsk->pending) flush_sigqueue(tsk->pending)
- *
- */
-
- if (unlikely(p->flags & PF_EXITING)) {
- ret = -1;
- goto out;
- }
-
if (unlikely(!list_empty(&q->list))) {
/*
* If an SI_TIMER entry is already queue just increment
* the overrun count.
*/
- if (q->info.si_code != SI_TIMER)
- BUG();
+ BUG_ON(q->info.si_code != SI_TIMER);
q->info.si_overrun++;
goto out;
}
@@ -1481,7 +1352,7 @@ retry:
signal_wake_up(p, sig == SIGKILL);
out:
- spin_unlock_irqrestore(&sh->siglock, flags);
+ unlock_task_sighand(p, &flags);
out_err:
rcu_read_unlock();
@@ -1613,14 +1484,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
spin_unlock_irqrestore(&psig->siglock, flags);
}
-static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
+static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
{
struct siginfo info;
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
- if (to_self)
+ if (tsk->ptrace & PT_PTRACED)
parent = tsk->parent;
else {
tsk = tsk->group_leader;
@@ -1695,7 +1566,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
!(current->ptrace & PT_ATTACHED)) &&
(likely(current->parent->signal != current->signal) ||
!unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
- do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
+ do_notify_parent_cldstop(current, CLD_TRAPPED);
read_unlock(&tasklist_lock);
schedule();
} else {
@@ -1744,25 +1615,17 @@ void ptrace_notify(int exit_code)
static void
finish_stop(int stop_count)
{
- int to_self;
-
/*
* If there are no other threads in the group, or if there is
* a group stop in progress and we are the last to stop,
* report to the parent. When ptraced, every thread reports itself.
*/
- if (stop_count < 0 || (current->ptrace & PT_PTRACED))
- to_self = 1;
- else if (stop_count == 0)
- to_self = 0;
- else
- goto out;
-
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
- read_unlock(&tasklist_lock);
+ if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, CLD_STOPPED);
+ read_unlock(&tasklist_lock);
+ }
-out:
schedule();
/*
* Now we don't run again until continued.
@@ -1776,12 +1639,10 @@ out:
* Returns nonzero if we've actually stopped and released the siglock.
* Returns zero if we didn't stop and still hold the siglock.
*/
-static int
-do_signal_stop(int signr)
+static int do_signal_stop(int signr)
{
struct signal_struct *sig = current->signal;
- struct sighand_struct *sighand = current->sighand;
- int stop_count = -1;
+ int stop_count;
if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
return 0;
@@ -1791,86 +1652,37 @@ do_signal_stop(int signr)
* There is a group stop in progress. We don't need to
* start another one.
*/
- signr = sig->group_exit_code;
stop_count = --sig->group_stop_count;
- current->exit_code = signr;
- set_current_state(TASK_STOPPED);
- if (stop_count == 0)
- sig->flags = SIGNAL_STOP_STOPPED;
- spin_unlock_irq(&sighand->siglock);
- }
- else if (thread_group_empty(current)) {
- /*
- * Lock must be held through transition to stopped state.
- */
- current->exit_code = current->signal->group_exit_code = signr;
- set_current_state(TASK_STOPPED);
- sig->flags = SIGNAL_STOP_STOPPED;
- spin_unlock_irq(&sighand->siglock);
- }
- else {
+ } else {
/*
* There is no group stop already in progress.
- * We must initiate one now, but that requires
- * dropping siglock to get both the tasklist lock
- * and siglock again in the proper order. Note that
- * this allows an intervening SIGCONT to be posted.
- * We need to check for that and bail out if necessary.
+ * We must initiate one now.
*/
struct task_struct *t;
- spin_unlock_irq(&sighand->siglock);
-
- /* signals can be posted during this window */
+ sig->group_exit_code = signr;
- read_lock(&tasklist_lock);
- spin_lock_irq(&sighand->siglock);
-
- if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
+ stop_count = 0;
+ for (t = next_thread(current); t != current; t = next_thread(t))
/*
- * Another stop or continue happened while we
- * didn't have the lock. We can just swallow this
- * signal now. If we raced with a SIGCONT, that
- * should have just cleared it now. If we raced
- * with another processor delivering a stop signal,
- * then the SIGCONT that wakes us up should clear it.
+ * Setting state to TASK_STOPPED for a group
+ * stop is always done with the siglock held,
+ * so this check has no races.
*/
- read_unlock(&tasklist_lock);
- return 0;
- }
-
- if (sig->group_stop_count == 0) {
- sig->group_exit_code = signr;
- stop_count = 0;
- for (t = next_thread(current); t != current;
- t = next_thread(t))
- /*
- * Setting state to TASK_STOPPED for a group
- * stop is always done with the siglock held,
- * so this check has no races.
- */
- if (!t->exit_state &&
- !(t->state & (TASK_STOPPED|TASK_TRACED))) {
- stop_count++;
- signal_wake_up(t, 0);
- }
- sig->group_stop_count = stop_count;
- }
- else {
- /* A race with another thread while unlocked. */
- signr = sig->group_exit_code;
- stop_count = --sig->group_stop_count;
- }
-
- current->exit_code = signr;
- set_current_state(TASK_STOPPED);
- if (stop_count == 0)
- sig->flags = SIGNAL_STOP_STOPPED;
-
- spin_unlock_irq(&sighand->siglock);
- read_unlock(&tasklist_lock);
+ if (!t->exit_state &&
+ !(t->state & (TASK_STOPPED|TASK_TRACED))) {
+ stop_count++;
+ signal_wake_up(t, 0);
+ }
+ sig->group_stop_count = stop_count;
}
+ if (stop_count == 0)
+ sig->flags = SIGNAL_STOP_STOPPED;
+ current->exit_code = sig->group_exit_code;
+ __set_current_state(TASK_STOPPED);
+
+ spin_unlock_irq(&current->sighand->siglock);
finish_stop(stop_count);
return 1;
}
@@ -1922,6 +1734,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
sigset_t *mask = &current->blocked;
int signr = 0;
+ try_to_freeze();
+
relock:
spin_lock_irq(&current->sighand->siglock);
for (;;) {
@@ -1988,7 +1802,7 @@ relock:
continue;
/* Init gets no signals it doesn't want. */
- if (current->pid == 1)
+ if (current == child_reaper)
continue;
if (sig_kernel_stop(signr)) {
@@ -2099,10 +1913,11 @@ long do_no_restart_syscall(struct restart_block *param)
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
int error;
- sigset_t old_block;
spin_lock_irq(&current->sighand->siglock);
- old_block = current->blocked;
+ if (oldset)
+ *oldset = current->blocked;
+
error = 0;
switch (how) {
case SIG_BLOCK:
@@ -2119,8 +1934,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
}
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- if (oldset)
- *oldset = old_block;
+
return error;
}
@@ -2307,7 +2121,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
timeout = schedule_timeout_interruptible(timeout);
- try_to_freeze();
spin_lock_irq(&current->sighand->siglock);
sig = dequeue_signal(current, &these, &info);
current->blocked = current->real_blocked;
@@ -2429,8 +2242,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
return kill_proc_info(sig, &info, pid);
}
-int
-do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
+int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct k_sigaction *k;
sigset_t mask;
@@ -2456,6 +2268,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
if (act) {
sigdelsetmask(&act->sa.sa_mask,
sigmask(SIGKILL) | sigmask(SIGSTOP));
+ *k = *act;
/*
* POSIX 3.3.1.3:
* "Setting a signal action to SIG_IGN for a signal that is
@@ -2468,19 +2281,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
* be discarded, whether or not it is blocked"
*/
if (act->sa.sa_handler == SIG_IGN ||
- (act->sa.sa_handler == SIG_DFL &&
- sig_kernel_ignore(sig))) {
- /*
- * This is a fairly rare case, so we only take the
- * tasklist_lock once we're sure we'll need it.
- * Now we must do this little unlock and relock
- * dance to maintain the lock hierarchy.
- */
+ (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
struct task_struct *t = current;
- spin_unlock_irq(&t->sighand->siglock);
- read_lock(&tasklist_lock);
- spin_lock_irq(&t->sighand->siglock);
- *k = *act;
sigemptyset(&mask);
sigaddset(&mask, sig);
rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2489,12 +2291,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
recalc_sigpending_tsk(t);
t = next_thread(t);
} while (t != current);
- spin_unlock_irq(&current->sighand->siglock);
- read_unlock(&tasklist_lock);
- return 0;
}
-
- *k = *act;
}
spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ad3295cdded5..ec8fed42a86f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
#include <linux/cpu.h>
#include <linux/kthread.h>
#include <linux/rcupdate.h>
+#include <linux/smp.h>
#include <asm/irq.h>
/*
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void)
register_cpu_notifier(&cpu_nfb);
return 0;
}
+
+#ifdef CONFIG_SMP
+/*
+ * Call a function on all processors
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
+{
+ int ret = 0;
+
+ preempt_disable();
+ ret = smp_call_function(func, info, retry, wait);
+ local_irq_disable();
+ func(info);
+ local_irq_enable();
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
+#endif
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c67189a25d52..ced91e1ff564 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -1,12 +1,11 @@
/*
* Detect Soft Lockups
*
- * started by Ingo Molnar, (C) 2005, Red Hat
+ * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
*
* this code detects soft lockups: incidents in where on a CPU
* the kernel does not reschedule for 10 seconds or more.
*/
-
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/init.h>
@@ -17,13 +16,14 @@
static DEFINE_SPINLOCK(print_lock);
-static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
-static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, touch_timestamp);
+static DEFINE_PER_CPU(unsigned long, print_timestamp);
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
static int did_panic = 0;
-static int softlock_panic(struct notifier_block *this, unsigned long event,
- void *ptr)
+
+static int
+softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
did_panic = 1;
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
void touch_softlockup_watchdog(void)
{
- per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+ per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
@@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog);
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
*/
-void softlockup_tick(struct pt_regs *regs)
+void softlockup_tick(void)
{
int this_cpu = smp_processor_id();
- unsigned long timestamp = per_cpu(timestamp, this_cpu);
+ unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
- if (per_cpu(print_timestamp, this_cpu) == timestamp)
+ /* prevent double reports: */
+ if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
+ did_panic ||
+ !per_cpu(watchdog_task, this_cpu))
return;
- /* Do not cause a second panic when there already was one */
- if (did_panic)
+ /* do not print during early bootup: */
+ if (unlikely(system_state != SYSTEM_RUNNING)) {
+ touch_softlockup_watchdog();
return;
+ }
- if (time_after(jiffies, timestamp + 10*HZ)) {
- per_cpu(print_timestamp, this_cpu) = timestamp;
+ /* Wake up the high-prio watchdog task every second: */
+ if (time_after(jiffies, touch_timestamp + HZ))
+ wake_up_process(per_cpu(watchdog_task, this_cpu));
+
+ /* Warn about unreasonable 10+ seconds delays: */
+ if (time_after(jiffies, touch_timestamp + 10*HZ)) {
+ per_cpu(print_timestamp, this_cpu) = touch_timestamp;
spin_lock(&print_lock);
printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
this_cpu);
- show_regs(regs);
+ dump_stack();
spin_unlock(&print_lock);
}
}
@@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu)
sched_setscheduler(current, SCHED_FIFO, &param);
current->flags |= PF_NOFREEZE;
- set_current_state(TASK_INTERRUPTIBLE);
-
/*
- * Run briefly once per second - if this gets delayed for
- * more than 10 seconds then the debug-printout triggers
- * in softlockup_tick():
+ * Run briefly once per second to reset the softlockup timestamp.
+ * If this gets delayed for more than 10 seconds then the
+ * debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
- msleep_interruptible(1000);
+ set_current_state(TASK_INTERRUPTIBLE);
touch_softlockup_watchdog();
+ schedule();
}
- __set_current_state(TASK_RUNNING);
return 0;
}
@@ -110,11 +118,11 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
printk("watchdog for %i failed\n", hotcpu);
return NOTIFY_BAD;
}
+ per_cpu(touch_timestamp, hotcpu) = jiffies;
per_cpu(watchdog_task, hotcpu) = p;
kthread_bind(p, hotcpu);
break;
case CPU_ONLINE:
-
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -144,6 +152,5 @@ __init void spawn_softlockup_task(void)
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
- notifier_chain_register(&panic_notifier_list, &panic_block);
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
}
-
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0375fcd5921d..d1b810782bc4 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock);
#define BUILD_LOCK_OPS(op, locktype) \
void __lockfunc _##op##_lock(locktype##_t *lock) \
{ \
- preempt_disable(); \
for (;;) { \
+ preempt_disable(); \
if (likely(_raw_##op##_trylock(lock))) \
break; \
preempt_enable(); \
+ \
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
while (!op##_can_lock(lock) && (lock)->break_lock) \
cpu_relax(); \
- preempt_disable(); \
} \
(lock)->break_lock = 0; \
} \
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
{ \
unsigned long flags; \
\
- preempt_disable(); \
for (;;) { \
+ preempt_disable(); \
local_irq_save(flags); \
if (likely(_raw_##op##_trylock(lock))) \
break; \
local_irq_restore(flags); \
- \
preempt_enable(); \
+ \
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
while (!op##_can_lock(lock) && (lock)->break_lock) \
cpu_relax(); \
- preempt_disable(); \
} \
(lock)->break_lock = 0; \
return flags; \
diff --git a/kernel/sys.c b/kernel/sys.c
index f91218a5463e..7ef7f6054c28 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
* and the like.
*/
-static struct notifier_block *reboot_notifier_list;
-static DEFINE_RWLOCK(notifier_lock);
+static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+
+/*
+ * Notifier chain core routines. The exported routines below
+ * are layered on top of these, with appropriate locking added.
+ */
+
+static int notifier_chain_register(struct notifier_block **nl,
+ struct notifier_block *n)
+{
+ while ((*nl) != NULL) {
+ if (n->priority > (*nl)->priority)
+ break;
+ nl = &((*nl)->next);
+ }
+ n->next = *nl;
+ rcu_assign_pointer(*nl, n);
+ return 0;
+}
+
+static int notifier_chain_unregister(struct notifier_block **nl,
+ struct notifier_block *n)
+{
+ while ((*nl) != NULL) {
+ if ((*nl) == n) {
+ rcu_assign_pointer(*nl, n->next);
+ return 0;
+ }
+ nl = &((*nl)->next);
+ }
+ return -ENOENT;
+}
+
+static int __kprobes notifier_call_chain(struct notifier_block **nl,
+ unsigned long val, void *v)
+{
+ int ret = NOTIFY_DONE;
+ struct notifier_block *nb;
+
+ nb = rcu_dereference(*nl);
+ while (nb) {
+ ret = nb->notifier_call(nb, val, v);
+ if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+ break;
+ nb = rcu_dereference(nb->next);
+ }
+ return ret;
+}
+
+/*
+ * Atomic notifier chain routines. Registration and unregistration
+ * use a mutex, and call_chain is synchronized by RCU (no locks).
+ */
/**
- * notifier_chain_register - Add notifier to a notifier chain
- * @list: Pointer to root list pointer
+ * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
* @n: New entry in notifier chain
*
- * Adds a notifier to a notifier chain.
+ * Adds a notifier to an atomic notifier chain.
*
* Currently always returns zero.
*/
+
+int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_register(&nh->head, n);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
+
+/**
+ * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from an atomic notifier chain.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_unregister(&nh->head, n);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ synchronize_rcu();
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
+
+/**
+ * atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in an atomic context, so they must not block.
+ * This routine uses RCU to synchronize with changes to the chain.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
-int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
{
- write_lock(&notifier_lock);
- while(*list)
- {
- if(n->priority > (*list)->priority)
- break;
- list= &((*list)->next);
- }
- n->next = *list;
- *list=n;
- write_unlock(&notifier_lock);
- return 0;
+ int ret;
+
+ rcu_read_lock();
+ ret = notifier_call_chain(&nh->head, val, v);
+ rcu_read_unlock();
+ return ret;
}
-EXPORT_SYMBOL(notifier_chain_register);
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+
+/*
+ * Blocking notifier chain routines. All access to the chain is
+ * synchronized by an rwsem.
+ */
/**
- * notifier_chain_unregister - Remove notifier from a notifier chain
- * @nl: Pointer to root list pointer
+ * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
* @n: New entry in notifier chain
*
- * Removes a notifier from a notifier chain.
+ * Adds a notifier to a blocking notifier chain.
+ * Must be called in process context.
*
- * Returns zero on success, or %-ENOENT on failure.
+ * Currently always returns zero.
*/
-int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n)
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
{
- write_lock(&notifier_lock);
- while((*nl)!=NULL)
- {
- if((*nl)==n)
- {
- *nl=n->next;
- write_unlock(&notifier_lock);
- return 0;
- }
- nl=&((*nl)->next);
- }
- write_unlock(&notifier_lock);
- return -ENOENT;
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call down_write().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_register(&nh->head, n);
+
+ down_write(&nh->rwsem);
+ ret = notifier_chain_register(&nh->head, n);
+ up_write(&nh->rwsem);
+ return ret;
}
-EXPORT_SYMBOL(notifier_chain_unregister);
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
- * notifier_call_chain - Call functions in a notifier chain
- * @n: Pointer to root pointer of notifier chain
+ * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from a blocking notifier chain.
+ * Must be called from process context.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call down_write().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_unregister(&nh->head, n);
+
+ down_write(&nh->rwsem);
+ ret = notifier_chain_unregister(&nh->head, n);
+ up_write(&nh->rwsem);
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+
+/**
+ * blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
*
- * Calls each function in a notifier chain in turn.
+ * Calls each function in a notifier chain in turn. The functions
+ * run in a process context, so they are allowed to block.
*
- * If the return value of the notifier can be and'd
- * with %NOTIFY_STOP_MASK, then notifier_call_chain
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
* will return immediately, with the return value of
* the notifier function which halted execution.
- * Otherwise, the return value is the return value
+ * Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
{
- int ret=NOTIFY_DONE;
- struct notifier_block *nb = *n;
+ int ret;
- while(nb)
- {
- ret=nb->notifier_call(nb,val,v);
- if(ret&NOTIFY_STOP_MASK)
- {
- return ret;
- }
- nb=nb->next;
- }
+ down_read(&nh->rwsem);
+ ret = notifier_call_chain(&nh->head, val, v);
+ up_read(&nh->rwsem);
return ret;
}
-EXPORT_SYMBOL(notifier_call_chain);
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
+
+/*
+ * Raw notifier chain routines. There is no protection;
+ * the caller must provide it. Use at your own risk!
+ */
+
+/**
+ * raw_notifier_chain_register - Add notifier to a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a raw notifier chain.
+ * All locking must be provided by the caller.
+ *
+ * Currently always returns zero.
+ */
+
+int raw_notifier_chain_register(struct raw_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return notifier_chain_register(&nh->head, n);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
+
+/**
+ * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from a raw notifier chain.
+ * All locking must be provided by the caller.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return notifier_chain_unregister(&nh->head, n);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+
+/**
+ * raw_notifier_call_chain - Call functions in a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in an undefined context.
+ * All locking must be provided by the caller.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then raw_notifier_call_chain
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+
+int raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return notifier_call_chain(&nh->head, val, v);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
/**
* register_reboot_notifier - Register function to be called at reboot time
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
* Registers a function with the list of functions
* to be called at reboot time.
*
- * Currently always returns zero, as notifier_chain_register
+ * Currently always returns zero, as blocking_notifier_chain_register
* always returns zero.
*/
int register_reboot_notifier(struct notifier_block * nb)
{
- return notifier_chain_register(&reboot_notifier_list, nb);
+ return blocking_notifier_chain_register(&reboot_notifier_list, nb);
}
EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,23 +424,11 @@ EXPORT_SYMBOL(register_reboot_notifier);
int unregister_reboot_notifier(struct notifier_block * nb)
{
- return notifier_chain_unregister(&reboot_notifier_list, nb);
+ return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
}
EXPORT_SYMBOL(unregister_reboot_notifier);
-#ifndef CONFIG_SECURITY
-int capable(int cap)
-{
- if (cap_raised(current->cap_effective, cap)) {
- current->flags |= PF_SUPERPRIV;
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL(capable);
-#endif
-
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
int no_nice;
@@ -392,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
void kernel_restart_prepare(char *cmd)
{
- notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+ blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
system_state = SYSTEM_RESTART;
device_shutdown();
}
@@ -442,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
void kernel_shutdown_prepare(enum system_states state)
{
- notifier_call_chain(&reboot_notifier_list,
+ blocking_notifier_call_chain(&reboot_notifier_list,
(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
system_state = state;
device_shutdown();
@@ -1009,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf)
*/
if (tbuf) {
struct tms tmp;
+ struct task_struct *tsk = current;
+ struct task_struct *t;
cputime_t utime, stime, cutime, cstime;
-#ifdef CONFIG_SMP
- if (thread_group_empty(current)) {
- /*
- * Single thread case without the use of any locks.
- *
- * We may race with release_task if two threads are
- * executing. However, release task first adds up the
- * counters (__exit_signal) before removing the task
- * from the process tasklist (__unhash_process).
- * __exit_signal also acquires and releases the
- * siglock which results in the proper memory ordering
- * so that the list modifications are always visible
- * after the counters have been updated.
- *
- * If the counters have been updated by the second thread
- * but the thread has not yet been removed from the list
- * then the other branch will be executing which will
- * block on tasklist_lock until the exit handling of the
- * other task is finished.
- *
- * This also implies that the sighand->siglock cannot
- * be held by another processor. So we can also
- * skip acquiring that lock.
- */
- utime = cputime_add(current->signal->utime, current->utime);
- stime = cputime_add(current->signal->utime, current->stime);
- cutime = current->signal->cutime;
- cstime = current->signal->cstime;
- } else
-#endif
- {
+ spin_lock_irq(&tsk->sighand->siglock);
+ utime = tsk->signal->utime;
+ stime = tsk->signal->stime;
+ t = tsk;
+ do {
+ utime = cputime_add(utime, t->utime);
+ stime = cputime_add(stime, t->stime);
+ t = next_thread(t);
+ } while (t != tsk);
- /* Process with multiple threads */
- struct task_struct *tsk = current;
- struct task_struct *t;
-
- read_lock(&tasklist_lock);
- utime = tsk->signal->utime;
- stime = tsk->signal->stime;
- t = tsk;
- do {
- utime = cputime_add(utime, t->utime);
- stime = cputime_add(stime, t->stime);
- t = next_thread(t);
- } while (t != tsk);
+ cutime = tsk->signal->cutime;
+ cstime = tsk->signal->cstime;
+ spin_unlock_irq(&tsk->sighand->siglock);
- /*
- * While we have tasklist_lock read-locked, no dying thread
- * can be updating current->signal->[us]time. Instead,
- * we got their counts included in the live thread loop.
- * However, another thread can come in right now and
- * do a wait call that updates current->signal->c[us]time.
- * To make sure we always see that pair updated atomically,
- * we take the siglock around fetching them.
- */
- spin_lock_irq(&tsk->sighand->siglock);
- cutime = tsk->signal->cutime;
- cstime = tsk->signal->cstime;
- spin_unlock_irq(&tsk->sighand->siglock);
- read_unlock(&tasklist_lock);
- }
tmp.tms_utime = cputime_to_clock_t(utime);
tmp.tms_stime = cputime_to_clock_t(stime);
tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1227,7 +1375,7 @@ asmlinkage long sys_setsid(void)
struct pid *pid;
int err = -EPERM;
- down(&tty_sem);
+ mutex_lock(&tty_mutex);
write_lock_irq(&tasklist_lock);
pid = find_pid(PIDTYPE_PGID, group_leader->pid);
@@ -1241,7 +1389,7 @@ asmlinkage long sys_setsid(void)
err = process_group(group_leader);
out:
write_unlock_irq(&tasklist_lock);
- up(&tty_sem);
+ mutex_unlock(&tty_mutex);
return err;
}
@@ -1375,7 +1523,7 @@ static void groups_sort(struct group_info *group_info)
/* a simple bsearch */
int groups_search(struct group_info *group_info, gid_t grp)
{
- int left, right;
+ unsigned int left, right;
if (!group_info)
return 0;
@@ -1383,7 +1531,7 @@ int groups_search(struct group_info *group_info, gid_t grp)
left = 0;
right = group_info->ngroups;
while (left < right) {
- int mid = (left+right)/2;
+ unsigned int mid = (left+right)/2;
int cmp = grp - GROUP_AT(group_info, mid);
if (cmp > 0)
left = mid + 1;
@@ -1433,7 +1581,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
return -EINVAL;
/* no need to grab task_lock here; it cannot change */
- get_group_info(current->group_info);
i = current->group_info->ngroups;
if (gidsetsize) {
if (i > gidsetsize) {
@@ -1446,7 +1593,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
}
}
out:
- put_group_info(current->group_info);
return i;
}
@@ -1487,9 +1633,7 @@ int in_group_p(gid_t grp)
{
int retval = 1;
if (grp != current->fsgid) {
- get_group_info(current->group_info);
retval = groups_search(current->group_info, grp);
- put_group_info(current->group_info);
}
return retval;
}
@@ -1500,9 +1644,7 @@ int in_egroup_p(gid_t grp)
{
int retval = 1;
if (grp != current->egid) {
- get_group_info(current->group_info);
retval = groups_search(current->group_info, grp);
- put_group_info(current->group_info);
}
return retval;
}
@@ -1630,20 +1772,21 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
{
struct rlimit new_rlim, *old_rlim;
+ unsigned long it_prof_secs;
int retval;
if (resource >= RLIM_NLIMITS)
return -EINVAL;
- if(copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+ if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
return -EFAULT;
- if (new_rlim.rlim_cur > new_rlim.rlim_max)
- return -EINVAL;
+ if (new_rlim.rlim_cur > new_rlim.rlim_max)
+ return -EINVAL;
old_rlim = current->signal->rlim + resource;
if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
- return -EPERM;
+ return -EPERM;
retval = security_task_setrlimit(resource, &new_rlim);
if (retval)
@@ -1653,19 +1796,40 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
*old_rlim = new_rlim;
task_unlock(current->group_leader);
- if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY &&
- (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
- new_rlim.rlim_cur <= cputime_to_secs(
- current->signal->it_prof_expires))) {
- cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur);
+ if (resource != RLIMIT_CPU)
+ goto out;
+
+ /*
+ * RLIMIT_CPU handling. Note that the kernel fails to return an error
+ * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
+ * very long-standing error, and fixing it now risks breakage of
+ * applications, so we live with it
+ */
+ if (new_rlim.rlim_cur == RLIM_INFINITY)
+ goto out;
+
+ it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
+ if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
+ unsigned long rlim_cur = new_rlim.rlim_cur;
+ cputime_t cputime;
+
+ if (rlim_cur == 0) {
+ /*
+ * The caller is asking for an immediate RLIMIT_CPU
+ * expiry. But we use the zero value to mean "it was
+ * never set". So let's cheat and make it one second
+ * instead
+ */
+ rlim_cur = 1;
+ }
+ cputime = secs_to_cputime(rlim_cur);
read_lock(&tasklist_lock);
spin_lock_irq(&current->sighand->siglock);
- set_process_cpu_timer(current, CPUCLOCK_PROF,
- &cputime, NULL);
+ set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
spin_unlock_irq(&current->sighand->siglock);
read_unlock(&tasklist_lock);
}
-
+out:
return 0;
}
@@ -1677,9 +1841,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
* a lot simpler! (Which we're not doing right now because we're not
* measuring them yet).
*
- * This expects to be called with tasklist_lock read-locked or better,
- * and the siglock not locked. It may momentarily take the siglock.
- *
* When sampling multiple threads for RUSAGE_SELF, under SMP we might have
* races with threads incrementing their own counters. But since word
* reads are atomic, we either get new values or old values and we don't
@@ -1687,6 +1848,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
* the c* fields from p->signal from races with exit.c updating those
* fields when reaping, so a sample either gets all the additions of a
* given child after it's reaped, or none so this sample is before reaping.
+ *
+ * tasklist_lock locking optimisation:
+ * If we are current and single threaded, we do not need to take the tasklist
+ * lock or the siglock. No one else can take our signal_struct away,
+ * no one else can reap the children to update signal->c* counters, and
+ * no one else can race with the signal-> fields.
+ * If we do not take the tasklist_lock, the signal-> fields could be read
+ * out of order while another thread was just exiting. So we place a
+ * read memory barrier when we avoid the lock. On the writer side,
+ * write memory barrier is implied in __exit_signal as __exit_signal releases
+ * the siglock spinlock after updating the signal-> fields.
+ *
+ * We don't really need the siglock when we access the non c* fields
+ * of the signal_struct (for RUSAGE_SELF) even in multithreaded
+ * case, since we take the tasklist lock for read and the non c* signal->
+ * fields are updated only in __exit_signal, which is called with
+ * tasklist_lock taken for write, hence these two threads cannot execute
+ * concurrently.
+ *
*/
static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
@@ -1694,13 +1874,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
cputime_t utime, stime;
+ int need_lock = 0;
memset((char *) r, 0, sizeof *r);
+ utime = stime = cputime_zero;
- if (unlikely(!p->signal))
- return;
+ if (p != current || !thread_group_empty(p))
+ need_lock = 1;
- utime = stime = cputime_zero;
+ if (need_lock) {
+ read_lock(&tasklist_lock);
+ if (unlikely(!p->signal)) {
+ read_unlock(&tasklist_lock);
+ return;
+ }
+ } else
+ /* See locking comments above */
+ smp_rmb();
switch (who) {
case RUSAGE_BOTH:
@@ -1740,6 +1930,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
BUG();
}
+ if (need_lock)
+ read_unlock(&tasklist_lock);
cputime_to_timeval(utime, &r->ru_utime);
cputime_to_timeval(stime, &r->ru_stime);
}
@@ -1747,9 +1939,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
{
struct rusage r;
- read_lock(&tasklist_lock);
k_getrusage(p, who, &r);
- read_unlock(&tasklist_lock);
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6b1..d82864c4a617 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
cond_syscall(sys_socketcall);
cond_syscall(sys_futex);
cond_syscall(compat_sys_futex);
+cond_syscall(sys_set_robust_list);
+cond_syscall(compat_sys_set_robust_list);
+cond_syscall(sys_get_robust_list);
+cond_syscall(compat_sys_get_robust_list);
cond_syscall(sys_epoll_create);
cond_syscall(sys_epoll_ctl);
cond_syscall(sys_epoll_wait);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 32b48e8ee36e..e82726faeeff 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -742,18 +742,18 @@ static ctl_table vm_table[] = {
{
.ctl_name = VM_DIRTY_WB_CS,
.procname = "dirty_writeback_centisecs",
- .data = &dirty_writeback_centisecs,
- .maxlen = sizeof(dirty_writeback_centisecs),
+ .data = &dirty_writeback_interval,
+ .maxlen = sizeof(dirty_writeback_interval),
.mode = 0644,
.proc_handler = &dirty_writeback_centisecs_handler,
},
{
.ctl_name = VM_DIRTY_EXPIRE_CS,
.procname = "dirty_expire_centisecs",
- .data = &dirty_expire_centisecs,
- .maxlen = sizeof(dirty_expire_centisecs),
+ .data = &dirty_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_userhz_jiffies,
},
{
.ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -848,9 +848,8 @@ static ctl_table vm_table[] = {
.data = &laptop_mode,
.maxlen = sizeof(laptop_mode),
.mode = 0644,
- .proc_handler = &proc_dointvec,
- .strategy = &sysctl_intvec,
- .extra1 = &zero,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
},
{
.ctl_name = VM_BLOCK_DUMP,
@@ -2054,6 +2053,8 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
+ if (*lvalp > LONG_MAX / HZ)
+ return 1;
*valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
} else {
int val = *valp;
@@ -2075,6 +2076,8 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
int write, void *data)
{
if (write) {
+ if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+ return 1;
*valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
} else {
int val = *valp;
diff --git a/kernel/time.c b/kernel/time.c
index 804539165d8b..ff8e7019c4c4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
-long pps_offset; /* pps time offset (us) */
-long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */
-
-long pps_freq; /* frequency offset (scaled ppm) */
-long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
-
-long pps_valid = PPS_VALID; /* pps signal watchdog counter */
-
-int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
-
-long pps_jitcnt; /* jitter limit exceeded */
-long pps_calcnt; /* calibration intervals */
-long pps_errcnt; /* calibration errors */
-long pps_stbcnt; /* stability limit exceeded */
-
-/* hook for a loadable hardpps kernel module */
-void (*hardpps_ptr)(struct timeval *);
-
/* we call this to notify the arch when the clock is being
* controlled. If no such arch routine, do nothing.
*/
@@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc)
result = -EINVAL;
goto leave;
}
- time_freq = txc->freq - pps_freq;
+ time_freq = txc->freq;
}
if (txc->modes & ADJ_MAXERROR) {
@@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc)
if ((time_next_adjust = txc->offset) == 0)
time_adjust = 0;
}
- else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
- ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
- (STA_PPSTIME | STA_PPSSIGNAL) ?
- pps_offset : txc->offset;
+ else if (time_status & STA_PLL) {
+ ltemp = txc->offset;
/*
* Scale the phase adjustment and
@@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc)
}
time_freq = min(time_freq, time_tolerance);
time_freq = max(time_freq, -time_tolerance);
- } /* STA_PLL || STA_PPSTIME */
+ } /* STA_PLL */
} /* txc->modes & ADJ_OFFSET */
if (txc->modes & ADJ_TICK) {
tick_usec = txc->tick;
tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
}
} /* txc->modes */
-leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
- || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
- && (time_status & STA_PPSSIGNAL) == 0)
- /* p. 24, (b) */
- || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
- == (STA_PPSTIME|STA_PPSJITTER))
- /* p. 24, (c) */
- || ((time_status & STA_PPSFREQ) != 0
- && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
- /* p. 24, (d) */
+leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
result = TIME_ERROR;
if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
@@ -380,7 +351,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
else {
txc->offset = shift_right(time_offset, SHIFT_UPDATE);
}
- txc->freq = time_freq + pps_freq;
+ txc->freq = time_freq;
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
@@ -388,14 +359,16 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
txc->precision = time_precision;
txc->tolerance = time_tolerance;
txc->tick = tick_usec;
- txc->ppsfreq = pps_freq;
- txc->jitter = pps_jitter >> PPS_AVG;
- txc->shift = pps_shift;
- txc->stabil = pps_stabil;
- txc->jitcnt = pps_jitcnt;
- txc->calcnt = pps_calcnt;
- txc->errcnt = pps_errcnt;
- txc->stbcnt = pps_stbcnt;
+
+ /* PPS is not implemented, so these are zero */
+ txc->ppsfreq = 0;
+ txc->jitter = 0;
+ txc->shift = 0;
+ txc->stabil = 0;
+ txc->jitcnt = 0;
+ txc->calcnt = 0;
+ txc->errcnt = 0;
+ txc->stbcnt = 0;
write_sequnlock_irq(&xtime_lock);
do_gettimeofday(&txc->time);
notify_arch_cmos_timer();
@@ -637,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
*
* Returns the timespec representation of the nsec parameter.
*/
-struct timespec ns_to_timespec(const nsec_t nsec)
+struct timespec ns_to_timespec(const s64 nsec)
{
struct timespec ts;
@@ -657,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec)
*
* Returns the timeval representation of the nsec parameter.
*/
-struct timeval ns_to_timeval(const nsec_t nsec)
+struct timeval ns_to_timeval(const s64 nsec)
{
struct timespec ts = ns_to_timespec(nsec);
struct timeval tv;
diff --git a/kernel/timer.c b/kernel/timer.c
index 2410c18dbeb1..ab189dd187cb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -86,7 +86,8 @@ struct tvec_t_base_s {
} ____cacheline_aligned_in_smp;
typedef struct tvec_t_base_s tvec_base_t;
-static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
+static tvec_base_t boot_tvec_bases;
static inline void set_running_timer(tvec_base_t *base,
struct timer_list *timer)
@@ -157,7 +158,7 @@ EXPORT_SYMBOL(__init_timer_base);
void fastcall init_timer(struct timer_list *timer)
{
timer->entry.next = NULL;
- timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
+ timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base;
}
EXPORT_SYMBOL(init_timer);
@@ -218,7 +219,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
ret = 1;
}
- new_base = &__get_cpu_var(tvec_bases);
+ new_base = __get_cpu_var(tvec_bases);
if (base != &new_base->t_base) {
/*
@@ -258,7 +259,7 @@ EXPORT_SYMBOL(__mod_timer);
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
- tvec_base_t *base = &per_cpu(tvec_bases, cpu);
+ tvec_base_t *base = per_cpu(tvec_bases, cpu);
unsigned long flags;
BUG_ON(timer_pending(timer) || !timer->function);
@@ -504,7 +505,7 @@ unsigned long next_timer_interrupt(void)
}
hr_expires += jiffies;
- base = &__get_cpu_var(tvec_bases);
+ base = __get_cpu_var(tvec_bases);
spin_lock(&base->t_base.lock);
expires = base->timer_jiffies + (LONG_MAX >> 1);
list = NULL;
@@ -696,18 +697,9 @@ static void second_overflow(void)
/*
* Compute the frequency estimate and additional phase adjustment due
- * to frequency error for the next second. When the PPS signal is
- * engaged, gnaw on the watchdog counter and update the frequency
- * computed by the pll and the PPS signal.
+ * to frequency error for the next second.
*/
- pps_valid++;
- if (pps_valid == PPS_VALID) { /* PPS signal lost */
- pps_jitter = MAXTIME;
- pps_stabil = MAXFREQ;
- time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
- STA_PPSWANDER | STA_PPSERROR);
- }
- ltemp = time_freq + pps_freq;
+ ltemp = time_freq;
time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
#if HZ == 100
@@ -901,7 +893,7 @@ EXPORT_SYMBOL(xtime_lock);
*/
static void run_timer_softirq(struct softirq_action *h)
{
- tvec_base_t *base = &__get_cpu_var(tvec_bases);
+ tvec_base_t *base = __get_cpu_var(tvec_bases);
hrtimer_run_queues();
if (time_after_eq(jiffies, base->timer_jiffies))
@@ -914,6 +906,7 @@ static void run_timer_softirq(struct softirq_action *h)
void run_local_timers(void)
{
raise_softirq(TIMER_SOFTIRQ);
+ softlockup_tick();
}
/*
@@ -944,7 +937,6 @@ void do_timer(struct pt_regs *regs)
/* prevent loading jiffies before storing new jiffies_64 value. */
barrier();
update_times();
- softlockup_tick(regs);
}
#ifdef __ARCH_WANT_SYS_ALARM
@@ -955,19 +947,7 @@ void do_timer(struct pt_regs *regs)
*/
asmlinkage unsigned long sys_alarm(unsigned int seconds)
{
- struct itimerval it_new, it_old;
- unsigned int oldalarm;
-
- it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
- it_new.it_value.tv_sec = seconds;
- it_new.it_value.tv_usec = 0;
- do_setitimer(ITIMER_REAL, &it_new, &it_old);
- oldalarm = it_old.it_value.tv_sec;
- /* ehhh.. We can't return 0 if we have an alarm pending.. */
- /* And we'd better return too much than too little anyway */
- if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
- oldalarm++;
- return oldalarm;
+ return alarm_setitimer(seconds);
}
#endif
@@ -1256,12 +1236,32 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
return 0;
}
-static void __devinit init_timers_cpu(int cpu)
+static int __devinit init_timers_cpu(int cpu)
{
int j;
tvec_base_t *base;
- base = &per_cpu(tvec_bases, cpu);
+ base = per_cpu(tvec_bases, cpu);
+ if (!base) {
+ static char boot_done;
+
+ /*
+ * Cannot do allocation in init_timers as that runs before the
+ * allocator initializes (and would waste memory if there are
+ * more possible CPUs than will ever be installed/brought up).
+ */
+ if (boot_done) {
+ base = kmalloc_node(sizeof(*base), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!base)
+ return -ENOMEM;
+ memset(base, 0, sizeof(*base));
+ } else {
+ base = &boot_tvec_bases;
+ boot_done = 1;
+ }
+ per_cpu(tvec_bases, cpu) = base;
+ }
spin_lock_init(&base->t_base.lock);
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1273,6 +1273,7 @@ static void __devinit init_timers_cpu(int cpu)
INIT_LIST_HEAD(base->tv1.vec + j);
base->timer_jiffies = jiffies;
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1295,8 +1296,8 @@ static void __devinit migrate_timers(int cpu)
int i;
BUG_ON(cpu_online(cpu));
- old_base = &per_cpu(tvec_bases, cpu);
- new_base = &get_cpu_var(tvec_bases);
+ old_base = per_cpu(tvec_bases, cpu);
+ new_base = get_cpu_var(tvec_bases);
local_irq_disable();
spin_lock(&new_base->t_base.lock);
@@ -1326,7 +1327,8 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
long cpu = (long)hcpu;
switch(action) {
case CPU_UP_PREPARE:
- init_timers_cpu(cpu);
+ if (init_timers_cpu(cpu) < 0)
+ return NOTIFY_BAD;
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
diff --git a/kernel/user.c b/kernel/user.c
index d9deae43a9ab..2116642f42c6 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -105,15 +105,19 @@ void free_uid(struct user_struct *up)
{
unsigned long flags;
+ if (!up)
+ return;
+
local_irq_save(flags);
- if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+ if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
uid_hash_remove(up);
+ spin_unlock_irqrestore(&uidhash_lock, flags);
key_put(up->uid_keyring);
key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
- spin_unlock(&uidhash_lock);
+ } else {
+ local_irq_restore(flags);
}
- local_irq_restore(flags);
}
struct user_struct * alloc_uid(uid_t uid)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b052e2c4c710..e9e464a90376 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -27,6 +27,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
+#include <linux/hardirq.h>
/*
* The per-CPU workqueue (if single thread, we always use the first
@@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work)
}
EXPORT_SYMBOL(cancel_rearming_delayed_work);
+/**
+ * execute_in_process_context - reliably execute the routine with user context
+ * @fn: the function to execute
+ * @data: data to pass to the function
+ * @ew: guaranteed storage for the execute work structure (must
+ * be available when the work executes)
+ *
+ * Executes the function immediately if process context is available,
+ * otherwise schedules the function for delayed execution.
+ *
+ * Returns: 0 - function was executed
+ * 1 - function was scheduled for execution
+ */
+int execute_in_process_context(void (*fn)(void *data), void *data,
+ struct execute_work *ew)
+{
+ if (!in_interrupt()) {
+ fn(data);
+ return 0;
+ }
+
+ INIT_WORK(&ew->work, fn, data);
+ schedule_work(&ew->work);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(execute_in_process_context);
+
int keventd_up(void)
{
return keventd_wq != NULL;