From 25ddbb18aae33ad255eb9f35aacebe3af01e1e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Oct 2008 22:01:41 -0700 Subject: Make the taint flags reliable It's somewhat unlikely that it happens, but right now a race window between interrupts or machine checks or oopses could corrupt the tainted bitmap because it is modified in a non atomic fashion. Convert the taint variable to an unsigned long and use only atomic bit operations on it. Unfortunately this means the intvec sysctl functions cannot be used on it anymore. It turned out the taint sysctl handler could actually be simplified a bit (since it only increases capabilities) so this patch actually removes code. [akpm@linux-foundation.org: remove unneeded include] Signed-off-by: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 67 ++++++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 39 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cfc5295f1e82..ec88fcc9a0d2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -149,7 +149,7 @@ extern int max_lock_depth; #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -379,10 +379,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_dointvec_taint, + .proc_handler = &proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -2228,49 +2227,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, NULL,NULL); } -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - /* - * Taint values can only be increased + * Taint values can only be increased + * This means we can safely use a temporary. */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int op; + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; } struct do_proc_dointvec_minmax_conv_param { -- cgit v1.2.3 From f221e726bf4e082a05dcd573379ac859bfba7126 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Oct 2008 22:04:23 -0700 Subject: sysctl: simplify ->strategy name and nlen parameters passed to ->strategy hook are unused, remove them. In general ->strategy hook should know what it's doing, and don't do something tricky for which, say, pointer to original userspace array may be needed (name). Signed-off-by: Alexey Dobriyan Acked-by: David S. Miller [ networking bits ] Cc: Ralf Baechle Cc: David Howells Cc: Matt Mackall Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/kernel/pm.c | 6 +++--- arch/mips/lasat/sysctl.c | 17 ++++++++--------- drivers/char/random.c | 2 +- include/linux/sysctl.h | 2 +- include/net/ip.h | 2 +- include/net/ndisc.h | 5 ++--- ipc/ipc_sysctl.c | 9 ++++----- kernel/sysctl.c | 29 +++++++++++++---------------- kernel/utsname_sysctl.c | 5 ++--- net/decnet/dn_dev.c | 4 ++-- net/decnet/sysctl_net_decnet.c | 4 ++-- net/ipv4/devinet.c | 7 +++---- net/ipv4/route.c | 7 +------ net/ipv4/sysctl_net_ipv4.c | 18 +++++++++--------- net/ipv6/addrconf.c | 1 - net/ipv6/ndisc.c | 11 ++++------- 16 files changed, 56 insertions(+), 73 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c index d1113c5031f5..be722fc1acff 100644 --- a/arch/frv/kernel/pm.c +++ b/arch/frv/kernel/pm.c @@ -211,7 +211,7 @@ static int cmode_procctl(ctl_table *ctl, int write, struct file *filp, return try_set_cmode(new_cmode)?:*lenp; } -static int cmode_sysctl(ctl_table *table, int __user *name, int nlen, +static int cmode_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -314,7 +314,7 @@ static int p0_procctl(ctl_table *ctl, int write, struct file *filp, return try_set_p0(new_p0)?:*lenp; } -static int p0_sysctl(ctl_table *table, int __user *name, int nlen, +static int p0_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -358,7 +358,7 @@ static int cm_procctl(ctl_table *ctl, int write, struct file *filp, return try_set_cm(new_cm)?:*lenp; } -static int cm_sysctl(ctl_table *table, int __user *name, int nlen, +static int cm_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 866881ec0cf8..8f88886feb12 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -38,14 +38,13 @@ #endif /* Strategy function to write EEPROM after changing string entry */ -int sysctl_lasatstring(ctl_table *table, int *name, int nlen, +int sysctl_lasatstring(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { int r; - r = sysctl_string(table, name, - nlen, oldval, oldlenp, newval, newlen); + r = sysctl_string(table, oldval, oldlenp, newval, newlen); if (r < 0) return r; @@ -113,13 +112,13 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, #endif /* Sysctl for setting the IP addresses */ -int sysctl_lasat_intvec(ctl_table *table, int *name, int nlen, +int sysctl_lasat_intvec(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { int r; - r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen); + r = sysctl_intvec(table, oldval, oldlenp, newval, newlen); if (r < 0) return r; @@ -131,7 +130,7 @@ int sysctl_lasat_intvec(ctl_table *table, int *name, int nlen, #ifdef CONFIG_DS1603 /* Same for RTC */ -int sysctl_lasat_rtc(ctl_table *table, int *name, int nlen, +int sysctl_lasat_rtc(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { @@ -140,7 +139,7 @@ int sysctl_lasat_rtc(ctl_table *table, int *name, int nlen, rtctmp = read_persistent_clock(); if (rtctmp < 0) rtctmp = 0; - r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen); + r = sysctl_intvec(table, oldval, oldlenp, newval, newlen); if (r < 0) return r; if (newval && newlen) @@ -211,13 +210,13 @@ int proc_lasat_ip(ctl_table *table, int write, struct file *filp, } #endif -static int sysctl_lasat_prid(ctl_table *table, int *name, int nlen, +static int sysctl_lasat_prid(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { int r; - r = sysctl_intvec(table, name, nlen, oldval, oldlenp, newval, newlen); + r = sysctl_intvec(table, oldval, oldlenp, newval, newlen); if (r < 0) return r; if (newval && newlen) { diff --git a/drivers/char/random.c b/drivers/char/random.c index 6af435b89867..c8752eaad483 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1205,7 +1205,7 @@ static int proc_do_uuid(ctl_table *table, int write, struct file *filp, return proc_dostring(&fake_table, write, filp, buffer, lenp, ppos); } -static int uuid_strategy(ctl_table *table, int __user *name, int nlen, +static int uuid_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d0437f36921f..39d471d1163b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -972,7 +972,7 @@ extern int sysctl_perm(struct ctl_table_root *root, typedef struct ctl_table ctl_table; -typedef int ctl_handler (struct ctl_table *table, int __user *name, int nlen, +typedef int ctl_handler (struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen); diff --git a/include/net/ip.h b/include/net/ip.h index 1cbccaf0de3f..bc026ecb513f 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -396,7 +396,7 @@ extern void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, int ipv4_doint_and_flush(ctl_table *ctl, int write, struct file* filp, void __user *buffer, size_t *lenp, loff_t *ppos); -int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, +int ipv4_doint_and_flush_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen); #ifdef CONFIG_PROC_FS diff --git a/include/net/ndisc.h b/include/net/ndisc.h index a01b7c4dc763..11dd0137c6a5 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -129,9 +129,8 @@ extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, void __user *buffer, size_t *lenp, loff_t *ppos); -int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, - int nlen, void __user *oldval, - size_t __user *oldlenp, +int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen); #endif diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 69bc85978ba0..0dfebc509426 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -131,7 +131,7 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, #ifdef CONFIG_SYSCTL_SYSCALL /* The generic sysctl ipc data routine. */ -static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, +static int sysctl_ipc_data(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -169,14 +169,13 @@ static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, return 1; } -static int sysctl_ipc_registered_data(ctl_table *table, int __user *name, - int nlen, void __user *oldval, size_t __user *oldlenp, +static int sysctl_ipc_registered_data(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { int rc; - rc = sysctl_ipc_data(table, name, nlen, oldval, oldlenp, newval, - newlen); + rc = sysctl_ipc_data(table, oldval, oldlenp, newval, newlen); if (newval && newlen && rc > 0) /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ec88fcc9a0d2..9792c315676a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1500,7 +1500,6 @@ void register_sysctl_root(struct ctl_table_root *root) /* Perform the actual read/write of a sysctl table entry. */ static int do_sysctl_strategy(struct ctl_table_root *root, struct ctl_table *table, - int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1514,8 +1513,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, return -EPERM; if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = table->strategy(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1525,8 +1523,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, /* If there is no strategy routine, or if the strategy returns * zero, proceed with automatic r/w */ if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = sysctl_data(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; } @@ -1558,7 +1555,7 @@ repeat: table = table->child; goto repeat; } - error = do_sysctl_strategy(root, table, name, nlen, + error = do_sysctl_strategy(root, table, oldval, oldlenp, newval, newlen); return error; @@ -2707,7 +2704,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, */ /* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2741,7 +2738,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen, } /* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2787,7 +2784,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen, * are between the minimum and maximum values given in the arrays * table->extra1 and table->extra2, respectively. */ -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2823,7 +2820,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2857,7 +2854,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2912,35 +2909,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) return error; } -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ab9659d269e..3b34b3545936 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, #ifdef CONFIG_SYSCTL_SYSCALL /* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, +static int sysctl_uts_string(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, write = newval && newlen; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); + r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); put_uts(table, write, uts_table.data); return r; } diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 2f0ac3c3eb71..96f9fcef2958 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -166,7 +166,7 @@ static int max_priority[] = { 127 }; /* From DECnet spec */ static int dn_forwarding_proc(ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen, +static int dn_forwarding_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen); @@ -318,7 +318,7 @@ static int dn_forwarding_proc(ctl_table *table, int write, #endif } -static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen, +static int dn_forwarding_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 228067c571ba..36400b266896 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -132,7 +132,7 @@ static int parse_addr(__le16 *addr, char *str) } -static int dn_node_address_strategy(ctl_table *table, int __user *name, int nlen, +static int dn_node_address_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -217,7 +217,7 @@ static int dn_node_address_handler(ctl_table *table, int write, } -static int dn_def_dev_strategy(ctl_table *table, int __user *name, int nlen, +static int dn_def_dev_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index b12dae2b0b2d..5154e729cf16 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1283,7 +1283,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write, return ret; } -static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen, +static int devinet_conf_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1379,12 +1379,11 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write, return ret; } -int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, +int ipv4_doint_and_flush_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp, - newval, newlen); + int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen); struct net *net = table->extra2; if (ret == 1) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a6d7c584f53b..942be04e7955 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2908,8 +2908,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, } static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, - int __user *name, - int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, @@ -2972,16 +2970,13 @@ static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, } static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table, - int __user *name, - int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { int old = ip_rt_secret_interval; - int ret = sysctl_jiffies(table, name, nlen, oldval, oldlenp, newval, - newlen); + int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen); rt_secret_reschedule(old); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 276d047fb85a..1bb10df8ce7d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -64,8 +64,8 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, } /* Validate changes from sysctl interface. */ -static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int ipv4_sysctl_local_port_range(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -80,7 +80,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, }; inet_get_local_port_range(range, range + 1); - ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); + ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen); if (ret == 0 && newval && newlen) { if (range[1] < range[0]) ret = -EINVAL; @@ -109,8 +109,8 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * return ret; } -static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int sysctl_tcp_congestion_control(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -122,7 +122,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int ret; tcp_get_default_congestion_control(val); - ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen); + ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); if (ret == 1 && newval && newlen) ret = tcp_set_default_congestion_control(val); return ret; @@ -165,8 +165,8 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return ret; } -static int strategy_allowed_congestion_control(ctl_table *table, int __user *name, - int nlen, void __user *oldval, +static int strategy_allowed_congestion_control(ctl_table *table, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) @@ -179,7 +179,7 @@ static int strategy_allowed_congestion_control(ctl_table *table, int __user *nam return -ENOMEM; tcp_get_available_congestion_control(tbl.data, tbl.maxlen); - ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen); + ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); if (ret == 1 && newval && newlen) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7b6a584b62dd..eea9542728ca 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3982,7 +3982,6 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, } static int addrconf_sysctl_forward_strategy(ctl_table *table, - int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 840b15780a36..7f39e9b36456 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1730,9 +1730,8 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f return ret; } -int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, - int nlen, void __user *oldval, - size_t __user *oldlenp, +int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, + void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { struct net_device *dev = ctl->extra1; @@ -1745,13 +1744,11 @@ int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, switch (ctl->ctl_name) { case NET_NEIGH_REACHABLE_TIME: - ret = sysctl_jiffies(ctl, name, nlen, - oldval, oldlenp, newval, newlen); + ret = sysctl_jiffies(ctl, oldval, oldlenp, newval, newlen); break; case NET_NEIGH_RETRANS_TIME_MS: case NET_NEIGH_REACHABLE_TIME_MS: - ret = sysctl_ms_jiffies(ctl, name, nlen, - oldval, oldlenp, newval, newlen); + ret = sysctl_ms_jiffies(ctl, oldval, oldlenp, newval, newlen); break; default: ret = 0; -- cgit v1.2.3 From ebf3f09c634906d371f2bfd71b41c7e0c52efe7e Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 15 Oct 2008 22:05:12 -0700 Subject: Configure out AIO support This patchs adds the CONFIG_AIO option which allows to remove support for asynchronous I/O operations, that are not necessarly used by applications, particularly on embedded devices. As this is a size-reduction option, it depends on CONFIG_EMBEDDED. It allows to save ~7 kilobytes of kernel code/data: text data bss dec hex filename 1115067 119180 217088 1451335 162547 vmlinux 1108025 119048 217088 1444161 160941 vmlinux.new -7042 -132 0 -7174 -1C06 +/- This patch has been originally written by Matt Mackall , and is part of the Linux Tiny project. [randy.dunlap@oracle.com: build fix] Signed-off-by: Thomas Petazzoni Cc: Benjamin LaHaise Cc: Zach Brown Signed-off-by: Matt Mackall Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 3 ++- include/linux/aio.h | 9 +++++++++ init/Kconfig | 8 ++++++++ kernel/sys_ni.c | 5 +++++ kernel/sysctl.c | 2 ++ 5 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/fs/Makefile b/fs/Makefile index b6f27dc26b72..d0c69f57e5bf 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -8,7 +8,7 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ - attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ + attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ stack.o @@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o diff --git a/include/linux/aio.h b/include/linux/aio.h index 09b276c35227..f6b8cf99b596 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -204,12 +204,21 @@ struct kioctx { /* prototypes */ extern unsigned aio_max_size; +#ifdef CONFIG_AIO extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); extern int aio_put_req(struct kiocb *iocb); extern void kick_iocb(struct kiocb *iocb); extern int aio_complete(struct kiocb *iocb, long res, long res2); struct mm_struct; extern void exit_aio(struct mm_struct *mm); +#else +static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } +static inline int aio_put_req(struct kiocb *iocb) { return 0; } +static inline void kick_iocb(struct kiocb *iocb) { } +static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; } +struct mm_struct; +static inline void exit_aio(struct mm_struct *mm) { } +#endif /* CONFIG_AIO */ #define io_wait_to_kiocb(wait) container_of(wait, struct kiocb, ki_wait) diff --git a/init/Kconfig b/init/Kconfig index 8a8e2d00c40e..5ceff3249a2d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -713,6 +713,14 @@ config SHMEM option replaces shmem and tmpfs with the much simpler ramfs code, which may be appropriate on small systems without swap. +config AIO + bool "Enable AIO support" if EMBEDDED + default y + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling + this option saves about 7k. + config VM_EVENT_COUNTERS default y bool "Enable VM event counters for /proc/vmstat" if EMBEDDED diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 503d8d4eb80a..a77b27b11b04 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -126,6 +126,11 @@ cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); cond_syscall(sys_flock); +cond_syscall(sys_io_setup); +cond_syscall(sys_io_destroy); +cond_syscall(sys_io_submit); +cond_syscall(sys_io_cancel); +cond_syscall(sys_io_getevents); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9792c315676a..617d41e4d6a0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1281,6 +1281,7 @@ static struct ctl_table fs_table[] = { .extra2 = &two, }, #endif +#ifdef CONFIG_AIO { .procname = "aio-nr", .data = &aio_nr, @@ -1295,6 +1296,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, -- cgit v1.2.3 From af936a1606246a10c145feac3770f6287f483f02 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Sat, 18 Oct 2008 20:26:53 -0700 Subject: vmscan: unevictable LRU scan sysctl This patch adds a function to scan individual or all zones' unevictable lists and move any pages that have become evictable onto the respective zone's inactive list, where shrink_inactive_list() will deal with them. Adds sysctl to scan all nodes, and per node attributes to individual nodes' zones. Kosaki: If evictable page found in unevictable lru when write /proc/sys/vm/scan_unevictable_pages, print filename and file offset of these pages. [akpm@linux-foundation.org: fix one CONFIG_MMU=n build error] [kosaki.motohiro@jp.fujitsu.com: adapt vmscan-unevictable-lru-scan-sysctl.patch to new sysfs API] Signed-off-by: Lee Schermerhorn Signed-off-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 5 ++ include/linux/rmap.h | 3 + include/linux/swap.h | 15 +++++ kernel/sysctl.c | 10 ++++ mm/rmap.c | 4 +- mm/vmscan.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 201 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index fb45d88a2446..f5207090885a 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -13,6 +13,7 @@ #include #include #include +#include static struct sysdev_class node_class = { .name = "node", @@ -191,6 +192,8 @@ int register_node(struct node *node, int num, struct node *parent) sysdev_create_file(&node->sysdev, &attr_meminfo); sysdev_create_file(&node->sysdev, &attr_numastat); sysdev_create_file(&node->sysdev, &attr_distance); + + scan_unevictable_register_node(node); } return error; } @@ -210,6 +213,8 @@ void unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_numastat); sysdev_remove_file(&node->sysdev, &attr_distance); + scan_unevictable_unregister_node(node); + sysdev_unregister(&node->sysdev); } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 955667e6a52d..1da48db8db09 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -75,6 +75,9 @@ void anon_vma_unlink(struct vm_area_struct *); void anon_vma_link(struct vm_area_struct *); void __anon_vma_link(struct vm_area_struct *); +extern struct anon_vma *page_lock_anon_vma(struct page *page); +extern void page_unlock_anon_vma(struct anon_vma *anon_vma); + /* * rmap interfaces called when adding or removing pte of page */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 07eda69412fb..a3af95b2cb6d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -235,15 +236,29 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) #ifdef CONFIG_UNEVICTABLE_LRU extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); + +extern unsigned long scan_unevictable_pages; +extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int scan_unevictable_register_node(struct node *node); +extern void scan_unevictable_unregister_node(struct node *node); #else static inline int page_evictable(struct page *page, struct vm_area_struct *vma) { return 1; } + static inline void scan_mapping_unevictable_pages(struct address_space *mapping) { } + +static inline int scan_unevictable_register_node(struct node *node) +{ + return 0; +} + +static inline void scan_unevictable_unregister_node(struct node *node) { } #endif extern int kswapd_run(int nid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a0..b3cc73931d1f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -833,6 +833,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_UNEVICTABLE_LRU + { + .ctl_name = CTL_UNNUMBERED, + .procname = "scan_unevictable_pages", + .data = &scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = &scan_unevictable_handler, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/mm/rmap.c b/mm/rmap.c index 7e60df99018e..7e90bebbeb6c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -181,7 +181,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -201,7 +201,7 @@ out: return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); diff --git a/mm/vmscan.c b/mm/vmscan.c index e5aaaad159ef..ca64e3e0c518 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2363,6 +2364,39 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) return 1; } +static void show_page_path(struct page *page) +{ + char buf[256]; + if (page_is_file_cache(page)) { + struct address_space *mapping = page->mapping; + struct dentry *dentry; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + spin_lock(&mapping->i_mmap_lock); + dentry = d_find_alias(mapping->host); + printk(KERN_INFO "rescued: %s %lu\n", + dentry_path(dentry, buf, 256), pgoff); + spin_unlock(&mapping->i_mmap_lock); + } else { +#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + printk(KERN_INFO "rescued: anon %s\n", + vma->vm_mm->owner->comm); + break; + } + page_unlock_anon_vma(anon_vma); +#endif + } +} + + /** * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list * @page: page to check evictability and move to appropriate lru list @@ -2382,6 +2416,9 @@ retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + + show_page_path(page); + __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); __inc_zone_state(zone, NR_INACTIVE_ANON + l); @@ -2451,4 +2488,133 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) } } + +/** + * scan_zone_unevictable_pages - check unevictable list for evictable pages + * @zone - zone of which to scan the unevictable list + * + * Scan @zone's unevictable LRU lists to check for pages that have become + * evictable. Move those that have to @zone's inactive list where they + * become candidates for reclaim, unless shrink_inactive_zone() decides + * to reactivate them. Pages that are still unevictable are rotated + * back onto @zone's unevictable list. + */ +#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ +void scan_zone_unevictable_pages(struct zone *zone) +{ + struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; + unsigned long scan; + unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); + + while (nr_to_scan > 0) { + unsigned long batch_size = min(nr_to_scan, + SCAN_UNEVICTABLE_BATCH_SIZE); + + spin_lock_irq(&zone->lru_lock); + for (scan = 0; scan < batch_size; scan++) { + struct page *page = lru_to_page(l_unevictable); + + if (!trylock_page(page)) + continue; + + prefetchw_prev_lru_page(page, l_unevictable, flags); + + if (likely(PageLRU(page) && PageUnevictable(page))) + check_move_unevictable_page(page, zone); + + unlock_page(page); + } + spin_unlock_irq(&zone->lru_lock); + + nr_to_scan -= batch_size; + } +} + + +/** + * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages + * + * A really big hammer: scan all zones' unevictable LRU lists to check for + * pages that have become evictable. Move those back to the zones' + * inactive list where they become candidates for reclaim. + * This occurs when, e.g., we have unswappable pages on the unevictable lists, + * and we add swap to the system. As such, it runs in the context of a task + * that has possibly/probably made some previously unevictable pages + * evictable. + */ +void scan_all_zones_unevictable_pages(void) +{ + struct zone *zone; + + for_each_zone(zone) { + scan_zone_unevictable_pages(zone); + } +} + +/* + * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of + * all nodes' unevictable lists for evictable pages + */ +unsigned long scan_unevictable_pages; + +int scan_unevictable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write && *(unsigned long *)table->data) + scan_all_zones_unevictable_pages(); + + scan_unevictable_pages = 0; + return 0; +} + +/* + * per node 'scan_unevictable_pages' attribute. On demand re-scan of + * a specified node's per zone unevictable lists for evictable pages. + */ + +static ssize_t read_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "0\n"); /* always zero; should fit... */ +} + +static ssize_t write_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + struct zone *node_zones = NODE_DATA(dev->id)->node_zones; + struct zone *zone; + unsigned long res; + unsigned long req = strict_strtoul(buf, 10, &res); + + if (!req) + return 1; /* zero is no-op */ + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + scan_zone_unevictable_pages(zone); + } + return 1; +} + + +static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, + read_scan_unevictable_node, + write_scan_unevictable_node); + +int scan_unevictable_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +void scan_unevictable_unregister_node(struct node *node) +{ + sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); +} + #endif -- cgit v1.2.3