From 7053aee26a3548ebaba046ae2e52396ccf56ac6c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:14 -0800 Subject: fsnotify: do not share events between notification groups Currently the fsnotify framework creates one event structure for each notification event and links this event into all interested notification groups. This is done to save memory when several notification groups are interested in the event. However, the need for an event structure shared between inotify and fanotify bloats the structure, so the result is often higher memory consumption. Another problem is that the fsnotify framework keeps path references with outstanding events so that fanotify can return open file descriptors with its events. This has the undesirable effect that a filesystem cannot be unmounted while there are outstanding events - a regression for inotify compared to the situation before it was converted to the fsnotify framework. For fanotify this problem is hard to avoid, and users of fanotify should expect this behavior when they ask for file descriptors for the notified files. This patch changes fsnotify and its users to create a separate event structure for each group. This allows for much simpler code (~400 lines removed by this patch) and also smaller event structures. For example, on a 64-bit system the original struct fsnotify_event consumes 120 bytes plus additional space for the file name, an additional 24 bytes for the second and each subsequent group linking the event, and an additional 32 bytes per inotify group for private data. After the conversion an inotify event consumes 48 bytes plus space for the file name, which is considerably less memory unless file names are long and several groups are interested in the events (both of which are uncommon). A fanotify event fits in 56 bytes after the conversion (fanotify doesn't care about file names, so its events don't have to allocate them) - a win unless four or more fanotify groups are interested in the event. The conversion also solves the unmount problem when only inotify is used, as we no longer have to grab path references for inotify events. 
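To make the size arithmetic above concrete, the following userspace-compilable sketch models a per-group inotify event. The structure and field names are hypothetical stand-ins (the real definitions live in the inotify backend, which is outside the kernel/ hunks shown below), but the layout reproduces the quoted 48-byte figure on a typical 64-bit ABI:

	/* Hypothetical layout sketch -- NOT the literal kernel structures. */
	#include <stdint.h>
	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };	/* 16 bytes on 64-bit */

	struct example_fsnotify_event {
		struct list_head list;		/* linkage in one group's own queue */
		void *inode;			/* inode the event applies to */
		uint32_t mask;			/* FS_* event mask */
	};

	struct example_inotify_event {
		struct example_fsnotify_event fse;	/* embedded generic part */
		int wd;					/* watch descriptor */
		uint32_t sync_cookie;			/* pairs rename events */
		int name_len;
		char name[];				/* file name, when present */
	};

	int main(void)
	{
		/* Prints 48 on x86-64: no per-group linking or private-data overhead. */
		printf("per-group inotify event: %zu bytes + file name\n",
		       sizeof(struct example_inotify_event));
		return 0;
	}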
[hughd@google.com: fanotify: fix corruption preventing startup] Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 8 +++++--- kernel/audit_watch.c | 14 +++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 43c307dc9453..bcc0b1821227 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -912,9 +912,11 @@ static void evict_chunk(struct audit_chunk *chunk) } static int audit_tree_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmonut_mark, - struct fsnotify_event *event) + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { BUG(); return -EOPNOTSUPP; @@ -945,7 +947,7 @@ static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, .should_send_event = audit_tree_send_event, .free_group_priv = NULL, - .free_event_priv = NULL, + .free_event = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 22831c4d369c..a760c32cb639 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -475,25 +475,25 @@ static bool audit_watch_should_send_event(struct fsnotify_group *group, struct i /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *dname) { struct inode *inode; - __u32 mask = event->mask; - const char *dname = event->file_name; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - switch (event->data_type) { + switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = event->path.dentry->d_inode; + inode = ((struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = event->inode; + inode = (struct inode *)data; break; default: BUG(); @@ -516,7 +516,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = { .handle_event = audit_watch_handle_event, .free_group_priv = NULL, .freeing_mark = NULL, - .free_event_priv = NULL, + .free_event = NULL, }; static int __init audit_watch_init(void) -- cgit v1.2.3 From 83c4c4b0a3aadc1ce7b5b2870ce1fc1f65498da0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:15 -0800 Subject: fsnotify: remove .should_send_event callback After removing event structure creation from the generic layer there is no reason for separate .should_send_event and .handle_event callbacks. So just remove the first one. 
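Since event creation now happens entirely inside .handle_event, any filtering a backend previously did in .should_send_event simply moves to the top of .handle_event, as the dnotify and inotify hunks below show. A hedged sketch of the resulting shape (the helpers named here are hypothetical placeholders, not real kernel APIs):

	static int example_handle_event(struct fsnotify_group *group,
					struct inode *to_tell,
					struct fsnotify_mark *inode_mark,
					struct fsnotify_mark *vfsmount_mark,
					u32 mask, void *data, int data_type,
					const unsigned char *file_name)
	{
		/* Filtering that used to live in .should_send_event moves here... */
		if (!example_event_is_interesting(inode_mark, mask, data, data_type))
			return 0;

		/* ...followed by allocating and queueing this group's own event. */
		return example_queue_event(group, to_tell, mask, file_name);
	}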
Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 22 ++++------------------ fs/notify/fanotify/fanotify.c | 18 ++++++++++-------- fs/notify/fsnotify.c | 5 ----- fs/notify/inotify/inotify_fsnotify.c | 24 +++++++----------------- include/linux/fsnotify_backend.h | 4 ---- kernel/audit_tree.c | 12 +----------- kernel/audit_watch.c | 9 --------- 7 files changed, 22 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index bfca53dbbf34..928688e3ee2f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -94,6 +94,10 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fown_struct *fown; __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return 0; + BUG_ON(vfsmount_mark); dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); @@ -121,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group, return 0; } -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - return true; -} - static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, @@ -151,7 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, .free_group_priv = NULL, .freeing_mark = NULL, .free_event = NULL, diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c26268d7bd9d..1f8f05220f8d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -88,18 +88,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, +static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, - __u32 event_mask, void *data, int data_type) + u32 event_mask, + void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p inode=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, inode, - inode_mark, vfsmnt_mark, event_mask, data, data_type); + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, + event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) @@ -163,6 +162,10 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, + data_type)) + return 0; + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, mask); @@ -225,7 +228,6 @@ static void fanotify_free_event(struct fsnotify_event 
*fsn_event) const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, - .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, .freeing_mark = NULL, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7c754c91c3f6..1d4e1ea2f37c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -177,11 +177,6 @@ static int send_to_group(struct inode *to_tell, if (!inode_test_mask && !vfsmount_test_mask) return 0; - if (group->ops->should_send_event(group, to_tell, inode_mark, - vfsmount_mark, mask, data, - data_is) == false) - return 0; - return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, file_name); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 6fabbd163d16..aad1a35e9af1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -81,6 +81,13 @@ int inotify_handle_event(struct fsnotify_group *group, BUG_ON(vfsmount_mark); + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return 0; + } if (file_name) { len = strlen(file_name); alloc_len += len + 1; @@ -122,22 +129,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify inotify_ignored_and_remove_idr(fsn_mark, group); } -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; - - if (d_unlinked(path->dentry)) - return false; - } - - return true; -} - /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the @@ -189,7 +180,6 @@ static void inotify_free_event(struct fsnotify_event *fsn_event) const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7f3d7dcfcd00..7d8d5e608594 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -94,10 +94,6 @@ struct fsnotify_fname; * userspace messages that marks have been removed. 
*/ struct fsnotify_ops { - bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type); int (*handle_event)(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index bcc0b1821227..ae8103b057fa 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -918,8 +918,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, u32 mask, void *data, int data_type, const unsigned char *file_name) { - BUG(); - return -EOPNOTSUPP; + return 0; } static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) @@ -935,17 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify BUG_ON(atomic_read(&entry->refcnt) < 1); } -static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return false; -} - static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .should_send_event = audit_tree_send_event, .free_group_priv = NULL, .free_event = NULL, .freeing_mark = audit_tree_freeing_mark, diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index a760c32cb639..367ac9a79acc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -465,14 +465,6 @@ void audit_remove_watch_rule(struct audit_krule *krule) } } -static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return true; -} - /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, @@ -512,7 +504,6 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .should_send_event = audit_watch_should_send_event, .handle_event = audit_watch_handle_event, .free_group_priv = NULL, .freeing_mark = NULL, -- cgit v1.2.3 From 56b27cf6030dd36c56a5542ab8bfa406d337f083 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2014 15:48:16 -0800 Subject: fsnotify: remove pointless NULL initializers We usually rely on the fact that struct members not specified in the initializer are set to NULL. So do that with fsnotify function pointers as well. 
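The reasoning rests on a C guarantee: members left out of a designated initializer are zero-initialized, so spelling out ".foo = NULL" is pure noise. A minimal, self-contained userspace illustration (not kernel code):

	#include <assert.h>
	#include <stddef.h>

	struct ops {
		int  (*handle_event)(void);
		void (*free_group_priv)(void);
		void (*freeing_mark)(void);
	};

	static int handler(void) { return 0; }

	/* Only one member is named; the others are implicitly NULL. */
	static const struct ops example_ops = {
		.handle_event = handler,
	};

	int main(void)
	{
		assert(example_ops.free_group_priv == NULL);
		assert(example_ops.freeing_mark == NULL);
		return 0;
	}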
Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 3 --- fs/notify/fanotify/fanotify.c | 1 - kernel/audit_tree.c | 2 -- kernel/audit_watch.c | 3 --- 4 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 928688e3ee2f..0b9ff4395e6a 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -138,9 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event = NULL, }; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 1f8f05220f8d..58772623f02a 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -230,5 +230,4 @@ const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, - .freeing_mark = NULL, }; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ae8103b057fa..67ccf0e7cca9 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -936,8 +936,6 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .free_group_priv = NULL, - .free_event = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 367ac9a79acc..2596fac5dcb4 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -505,9 +505,6 @@ static int audit_watch_handle_event(struct fsnotify_group *group, static const struct fsnotify_ops audit_watch_fsnotify_ops = { .handle_event = audit_watch_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event = NULL, }; static int __init audit_watch_init(void) -- cgit v1.2.3 From 49f0ce5f92321cdcf741e35f385669a421013cb7 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 21 Jan 2014 15:49:14 -0800 Subject: mm: add overcommit_kbytes sysctl variable Some applications that run on HPC clusters are designed around the availability of RAM and the overcommit ratio is fine tuned to get the maximum usage of memory without swapping. With growing memory, the 1%-of-all-RAM grain provided by overcommit_ratio has become too coarse for these workload (on a 2TB machine it represents no less than 20GB). This patch adds the new overcommit_kbytes sysctl variable that allow a much finer grain. 
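The new knob lives at /proc/sys/vm/overcommit_kbytes and, as the handlers added to mm/util.c below implement, setting it zeroes overcommit_ratio (and vice versa); the limit itself is only enforced when overcommit_memory is 2. A small userspace sketch that exercises this on a kernel carrying the patch (needs root):

	#include <stdio.h>

	static long read_long(const char *path)
	{
		FILE *f = fopen(path, "r");
		long val = -1;

		if (f) {
			if (fscanf(f, "%ld", &val) != 1)
				val = -1;
			fclose(f);
		}
		return val;
	}

	int main(void)
	{
		FILE *f = fopen("/proc/sys/vm/overcommit_kbytes", "w");

		if (!f) {
			perror("overcommit_kbytes");
			return 1;
		}
		/* With overcommit_memory=2, the commit limit becomes swap + 1 GiB. */
		fprintf(f, "%d\n", 1024 * 1024);
		fclose(f);

		printf("overcommit_kbytes = %ld\n",
		       read_long("/proc/sys/vm/overcommit_kbytes"));
		printf("overcommit_ratio  = %ld (expected 0)\n",
		       read_long("/proc/sys/vm/overcommit_ratio"));
		return 0;
	}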
[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Jerome Marchand Cc: Dave Hansen Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++++++++ Documentation/vm/overcommit-accounting | 7 ++++--- include/linux/mm.h | 9 +++++++++ include/linux/mman.h | 1 + kernel/sysctl.c | 11 ++++++++--- mm/mmap.c | 1 + mm/nommu.c | 1 + mm/util.c | 36 ++++++++++++++++++++++++++++++++-- 8 files changed, 70 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 1fbd4eb7b64a..9f5481bdc5a4 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -47,6 +47,7 @@ Currently, these files are in /proc/sys/vm: - numa_zonelist_order - oom_dump_tasks - oom_kill_allocating_task +- overcommit_kbytes - overcommit_memory - overcommit_ratio - page-cluster @@ -574,6 +575,17 @@ The default value is 0. ============================================================== +overcommit_kbytes: + +When overcommit_memory is set to 2, the committed address space is not +permitted to exceed swap plus this amount of physical RAM. See below. + +Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one +of them may be specified at a time. Setting one disables the other (which +then appears as 0 when read). + +============================================================== + overcommit_memory: This value contains a flag that enables memory overcommitment. diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting index 8eaa2fc4b8fa..cbfaaa674118 100644 --- a/Documentation/vm/overcommit-accounting +++ b/Documentation/vm/overcommit-accounting @@ -14,8 +14,8 @@ The Linux kernel supports the following overcommit handling modes 2 - Don't overcommit. The total address space commit for the system is not permitted to exceed swap + a - configurable percentage (default is 50) of physical RAM. - Depending on the percentage you use, in most situations + configurable amount (default is 50%) of physical RAM. + Depending on the amount you use, in most situations this means a process will not be killed while accessing pages but will receive errors on memory allocation as appropriate. @@ -26,7 +26,8 @@ The Linux kernel supports the following overcommit handling modes The overcommit policy is set via the sysctl `vm.overcommit_memory'. -The overcommit percentage is set via `vm.overcommit_ratio'. +The overcommit amount can be set via `vm.overcommit_ratio' (percentage) +or `vm.overcommit_kbytes' (absolute value). The current overcommit limit and amount committed are viewable in /proc/meminfo as CommitLimit and Committed_AS respectively. 
diff --git a/include/linux/mm.h b/include/linux/mm.h index 4c0c01afc19b..a512dd836931 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -57,6 +57,15 @@ extern int sysctl_legacy_va_layout; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; +extern unsigned long sysctl_overcommit_kbytes; + +extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, + size_t *, loff_t *); +extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, + size_t *, loff_t *); + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 7f7f8dae4b1d..16373c8f5f57 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -9,6 +9,7 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern unsigned long sysctl_overcommit_kbytes; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f905cf..332cefcdb04b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,8 +95,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP @@ -1121,7 +1119,14 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, }, { .procname = "page-cluster", diff --git a/mm/mmap.c b/mm/mmap.c index 834b2d785f1e..39552de6e1db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ diff --git a/mm/util.c b/mm/util.c index 808f375648e7..a24aa22f2473 100644 --- a/mm/util.c +++ b/mm/util.c @@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) return mapping; } +int overcommit_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_kbytes = 0; + return 
ret; +} + +int overcommit_kbytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_ratio = 0; + return ret; +} + /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { - return ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; + unsigned long allowed; + + if (sysctl_overcommit_kbytes) + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); + else + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100); + allowed += total_swap_pages; + + return allowed; } -- cgit v1.2.3 From 0c740d0afc3bff0a097ad03a1c8df92757516f5c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jan 2014 15:49:56 -0800 Subject: introduce for_each_thread() to replace the buggy while_each_thread() while_each_thread() and next_thread() should die, almost every lockless usage is wrong. 1. Unless g == current, the lockless while_each_thread() is not safe. while_each_thread(g, t) can loop forever if g exits, next_thread() can't reach the unhashed thread in this case. Note that this can happen even if g is the group leader, it can exec. 2. Even if while_each_thread() itself was correct, people often use it wrongly. It was never safe to just take rcu_read_lock() and loop unless you verify that pid_alive(g) == T, even the first next_thread() can point to the already freed/reused memory. This patch adds signal_struct->thread_head and task->thread_node to create the normal rcu-safe list with the stable head. The new for_each_thread(g, t) helper is always safe under rcu_read_lock() as long as this task_struct can't go away. Note: of course it is ugly to have both task_struct->thread_node and the old task_struct->thread_group, we will kill it later, after we change the users of while_each_thread() to use for_each_thread(). Perhaps we can kill it even before we convert all users, we can reimplement next_thread(t) using the new thread_head/thread_node. But we can't do this right now because this will lead to subtle behavioural changes. For example, do/while_each_thread() always sees at least one task, while for_each_thread() can do nothing if the whole thread group has died. Or thread_group_empty(), currently its semantics is not clear unless thread_group_leader(p) and we need to audit the callers before we can change it. So this patch adds the new interface which has to coexist with the old one for some time, hopefully the next changes will be more or less straightforward and the old one will go away soon. Signed-off-by: Oleg Nesterov Reviewed-by: Sergey Dyasly Tested-by: Sergey Dyasly Reviewed-by: Sameer Nanda Acked-by: David Rientjes Cc: "Eric W. 
Biederman" Cc: Frederic Weisbecker Cc: Mandeep Singh Baines Cc: "Ma, Xindong" Cc: Michal Hocko Cc: "Tu, Xiaobing" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 2 ++ include/linux/sched.h | 12 ++++++++++++ kernel/exit.c | 1 + kernel/fork.c | 7 +++++++ 4 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f0e52383a001..1516a8ff8f92 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -41,6 +41,7 @@ extern struct fs_struct init_fs; #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ + .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ .shared_pending = { \ .list = LIST_HEAD_INIT(sig.shared_pending.list), \ @@ -222,6 +223,7 @@ extern struct task_group root_task_group; [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ }, \ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ + .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ INIT_IDS \ INIT_PERF_EVENTS(tsk) \ INIT_TRACE_IRQFLAGS \ diff --git a/include/linux/sched.h b/include/linux/sched.h index ffccdad050b5..485234d2fd42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,6 +549,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ @@ -1271,6 +1272,7 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; + struct list_head thread_node; struct completion *vfork_done; /* for vfork() */ int __user *set_child_tid; /* CLONE_CHILD_SETTID */ @@ -2341,6 +2343,16 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. 
*/ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; diff --git a/kernel/exit.c b/kernel/exit.c index a949819055d5..1e77fc645317 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); + list_del_rcu(&p->thread_node); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 294189fc7ac8..2f11bbe376b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1035,6 +1035,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); + + /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ + sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); + tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); + init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); @@ -1474,6 +1479,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, atomic_inc(¤t->signal->sigcnt); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); + list_add_tail_rcu(&p->thread_node, + &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; -- cgit v1.2.3 From 9da791dfabc60218c81904c7906b45789466e68e Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:23 -0800 Subject: kernel/printk/printk.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Signed-off-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar Cc: Yinghai Lu Cc: Tejun Heo Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index be7c86bae576..f8b41bddc6dc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -757,14 +757,10 @@ void __init setup_log_buf(int early) return; if (early) { - unsigned long mem; - - mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (!mem) - return; - new_log_buf = __va(mem); + new_log_buf = + memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); } else { - new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); } if (unlikely(!new_log_buf)) { -- cgit v1.2.3 From c2f69cdafebb3a46e43b5ac57ca12b539a2c790f Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 21 Jan 2014 15:50:27 -0800 Subject: kernel/power/snapshot.c: use memblock apis for early memory allocations Switch to memblock interfaces for early memory allocator instead of bootmem allocator. 
No functional change in beahvior than what it is in current code from bootmem users points of view. Archs already converted to NO_BOOTMEM now directly use memblock interfaces instead of bootmem wrappers build on top of memblock. And the archs which still uses bootmem, these new apis just fallback to exiting bootmem APIs. Acked-by: "Rafael J. Wysocki" Signed-off-by: Santosh Shilimkar Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Greg Kroah-Hartman Cc: Grygorii Strashko Cc: H. Peter Anvin Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Paul Walmsley Cc: Pavel Machek Cc: Russell King Cc: Tejun Heo Cc: Tony Lindgren Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b38109e204af..d9f61a145802 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem(sizeof(struct nosave_region)); + region = memblock_virt_alloc(sizeof(struct nosave_region), 0); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); -- cgit v1.2.3 From 286549dcaf4f128cb04f0ad56dfb677d7d19b500 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 15:51:03 -0800 Subject: sched: add tracepoints related to NUMA task migration This patch adds three tracepoints o trace_sched_move_numa when a task is moved to a node o trace_sched_swap_numa when a task is swapped with another task o trace_sched_stick_numa when a numa-related migration fails The tracepoints allow the NUMA scheduler activity to be monitored and the following high-level metrics can be calculated o NUMA migrated stuck nr trace_sched_stick_numa o NUMA migrated idle nr trace_sched_move_numa o NUMA migrated swapped nr trace_sched_swap_numa o NUMA local swapped trace_sched_swap_numa src_nid == dst_nid (should never happen) o NUMA remote swapped trace_sched_swap_numa src_nid != dst_nid (should == NUMA migrated swapped) o NUMA group swapped trace_sched_swap_numa src_ngid == dst_ngid Maybe a small number of these are acceptable but a high number would be a major surprise. It would be even worse if bounces are frequent. o NUMA avg task migs. Average number of migrations for tasks o NUMA stddev task mig Self-explanatory o NUMA max task migs. Maximum number of migrations for a single task In general the intent of the tracepoints is to help diagnose problems where automatic NUMA balancing appears to be doing an excessive amount of useless work. 
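The three tracepoints can be consumed through tracefs like any other sched event. A hedged sketch of a tiny reader follows; it assumes tracefs is mounted at /sys/kernel/debug/tracing (the usual location on kernels of this vintage) and needs root:

	#include <stdio.h>

	static void enable_event(const char *base, const char *name)
	{
		char path[256];
		FILE *f;

		snprintf(path, sizeof(path), "%s/events/sched/%s/enable", base, name);
		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return;
		}
		fputs("1", f);
		fclose(f);
	}

	int main(void)
	{
		const char *base = "/sys/kernel/debug/tracing";
		char path[256], line[512];
		FILE *pipe;

		enable_event(base, "sched_move_numa");
		enable_event(base, "sched_swap_numa");
		enable_event(base, "sched_stick_numa");

		/* Each emitted line carries pid/tgid/ngid and src/dst cpu and node. */
		snprintf(path, sizeof(path), "%s/trace_pipe", base);
		pipe = fopen(path, "r");
		if (!pipe) {
			perror(path);
			return 1;
		}
		while (fgets(line, sizeof(line), pipe))
			fputs(line, stdout);
		fclose(pipe);
		return 0;
	}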
[akpm@linux-foundation.org: remove semicolon-after-if, repair coding-style] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/sched.h | 87 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 2 + kernel/sched/fair.c | 6 ++- 3 files changed, 94 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 04c308413a5d..67e1bbf83695 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -443,6 +443,93 @@ TRACE_EVENT(sched_process_hang, ); #endif /* CONFIG_DETECT_HUNG_TASK */ +DECLARE_EVENT_CLASS(sched_move_task_template, + + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu), + + TP_STRUCT__entry( + __field( pid_t, pid ) + __field( pid_t, tgid ) + __field( pid_t, ngid ) + __field( int, src_cpu ) + __field( int, src_nid ) + __field( int, dst_cpu ) + __field( int, dst_nid ) + ), + + TP_fast_assign( + __entry->pid = task_pid_nr(tsk); + __entry->tgid = task_tgid_nr(tsk); + __entry->ngid = task_numa_group_id(tsk); + __entry->src_cpu = src_cpu; + __entry->src_nid = cpu_to_node(src_cpu); + __entry->dst_cpu = dst_cpu; + __entry->dst_nid = cpu_to_node(dst_cpu); + ), + + TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", + __entry->pid, __entry->tgid, __entry->ngid, + __entry->src_cpu, __entry->src_nid, + __entry->dst_cpu, __entry->dst_nid) +); + +/* + * Tracks migration of tasks from one runqueue to another. Can be used to + * detect if automatic NUMA balancing is bouncing between nodes + */ +DEFINE_EVENT(sched_move_task_template, sched_move_numa, + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu) +); + +DEFINE_EVENT(sched_move_task_template, sched_stick_numa, + TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), + + TP_ARGS(tsk, src_cpu, dst_cpu) +); + +TRACE_EVENT(sched_swap_numa, + + TP_PROTO(struct task_struct *src_tsk, int src_cpu, + struct task_struct *dst_tsk, int dst_cpu), + + TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), + + TP_STRUCT__entry( + __field( pid_t, src_pid ) + __field( pid_t, src_tgid ) + __field( pid_t, src_ngid ) + __field( int, src_cpu ) + __field( int, src_nid ) + __field( pid_t, dst_pid ) + __field( pid_t, dst_tgid ) + __field( pid_t, dst_ngid ) + __field( int, dst_cpu ) + __field( int, dst_nid ) + ), + + TP_fast_assign( + __entry->src_pid = task_pid_nr(src_tsk); + __entry->src_tgid = task_tgid_nr(src_tsk); + __entry->src_ngid = task_numa_group_id(src_tsk); + __entry->src_cpu = src_cpu; + __entry->src_nid = cpu_to_node(src_cpu); + __entry->dst_pid = task_pid_nr(dst_tsk); + __entry->dst_tgid = task_tgid_nr(dst_tsk); + __entry->dst_ngid = task_numa_group_id(dst_tsk); + __entry->dst_cpu = dst_cpu; + __entry->dst_nid = cpu_to_node(dst_cpu); + ), + + TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", + __entry->src_pid, __entry->src_tgid, __entry->src_ngid, + __entry->src_cpu, __entry->src_nid, + __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, + __entry->dst_cpu, __entry->dst_nid) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 36c951b7eef8..5ae36cc11fe5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, 
struct task_struct *p) if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) goto out; + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); out: @@ -4603,6 +4604,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b24b6cfde9aa..867b0a4b0893 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1250,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p) p->numa_scan_period = task_scan_min(p); if (env.best_task == NULL) { - int ret = migrate_task_to(p, env.best_cpu); + ret = migrate_task_to(p, env.best_cpu); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); return ret; } -- cgit v1.2.3