From 3fe0313e6ec572e6bb3f9d247316a834336db4be Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 26 Oct 2008 20:50:26 +0100 Subject: Hibernate: Call platform_begin before swsusp_shrink_memory Call platform_begin() before swsusp_shrink_memory() so that we can always allocate enough memory to save the ACPI NVS region from platform_begin(). Signed-off-by: Zhang Rui Acked-by: Nigel Cunningham Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/disk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index c9d74083746f..096fe4899ea4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -259,12 +259,12 @@ int hibernation_snapshot(int platform_mode) { int error, ftrace_save; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + error = platform_begin(platform_mode); if (error) return error; - error = platform_begin(platform_mode); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); if (error) goto Close; -- cgit v1.2.3 From 3f4b0ef7f2899c91b1d6958779f084b44dd59d32 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 26 Oct 2008 20:52:15 +0100 Subject: ACPI hibernate: Add a mechanism to save/restore ACPI NVS memory According to the ACPI Specification 3.0b, Section 15.3.2, "OSPM will call the _PTS control method some time before entering a sleeping state, to allow the platform's AML code to update this memory image before entering the sleeping state. After the system awakes from an S4 state, OSPM will restore this memory area and call the _WAK control method to enable the BIOS to reclaim its memory image." For this reason, implement a mechanism allowing us to save the NVS memory during hibernation and to restore it during the subsequent resume. Based on a patch by Zhang Rui. Signed-off-by: Rafael J. Wysocki Acked-by: Nigel Cunningham Cc: Zhang Rui Signed-off-by: Len Brown --- drivers/acpi/sleep/main.c | 53 +++++++++++++++++--- include/linux/suspend.h | 13 +++++ kernel/power/swsusp.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 28a691cc625e..45a8015e4217 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -394,9 +394,25 @@ void __init acpi_no_s4_hw_signature(void) static int acpi_hibernation_begin(void) { - acpi_target_sleep_state = ACPI_STATE_S4; - acpi_sleep_tts_switch(acpi_target_sleep_state); - return 0; + int error; + + error = hibernate_nvs_alloc(); + if (!error) { + acpi_target_sleep_state = ACPI_STATE_S4; + acpi_sleep_tts_switch(acpi_target_sleep_state); + } + + return error; +} + +static int acpi_hibernation_pre_snapshot(void) +{ + int error = acpi_pm_prepare(); + + if (!error) + hibernate_nvs_save(); + + return error; } static int acpi_hibernation_enter(void) @@ -417,6 +433,12 @@ static int acpi_hibernation_enter(void) return ACPI_SUCCESS(status) ? 0 : -EFAULT; } +static void acpi_hibernation_finish(void) +{ + hibernate_nvs_free(); + acpi_pm_finish(); +} + static void acpi_hibernation_leave(void) { /* @@ -432,6 +454,8 @@ static void acpi_hibernation_leave(void) "cannot resume!\n"); panic("ACPI S4 hardware signature mismatch"); } + /* Restore the NVS memory area */ + hibernate_nvs_restore(); } static void acpi_pm_enable_gpes(void) @@ -442,8 +466,8 @@ static void acpi_pm_enable_gpes(void) static struct platform_hibernation_ops acpi_hibernation_ops = { .begin = acpi_hibernation_begin, .end = acpi_pm_end, - .pre_snapshot = acpi_pm_prepare, - .finish = acpi_pm_finish, + .pre_snapshot = acpi_hibernation_pre_snapshot, + .finish = acpi_hibernation_finish, .prepare = acpi_pm_prepare, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, @@ -469,8 +493,21 @@ static int acpi_hibernation_begin_old(void) error = acpi_sleep_prepare(ACPI_STATE_S4); + if (!error) { + error = hibernate_nvs_alloc(); + if (!error) + acpi_target_sleep_state = ACPI_STATE_S4; + } + return error; +} + +static int acpi_hibernation_pre_snapshot_old(void) +{ + int error = acpi_pm_disable_gpes(); + if (!error) - acpi_target_sleep_state = ACPI_STATE_S4; + hibernate_nvs_save(); + return error; } @@ -481,8 +518,8 @@ static int acpi_hibernation_begin_old(void) static struct platform_hibernation_ops acpi_hibernation_ops_old = { .begin = acpi_hibernation_begin_old, .end = acpi_pm_end, - .pre_snapshot = acpi_pm_disable_gpes, - .finish = acpi_pm_finish, + .pre_snapshot = acpi_hibernation_pre_snapshot_old, + .finish = acpi_hibernation_finish, .prepare = acpi_pm_disable_gpes, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 2ce8207686e2..2b409c44db83 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -232,6 +232,11 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(struct platform_hibernation_ops *ops); extern int hibernate(void); +extern int hibernate_nvs_register(unsigned long start, unsigned long size); +extern int hibernate_nvs_alloc(void); +extern void hibernate_nvs_free(void); +extern void hibernate_nvs_save(void); +extern void hibernate_nvs_restore(void); #else /* CONFIG_HIBERNATION */ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} @@ -239,6 +244,14 @@ static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } +static inline int hibernate_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +static inline int hibernate_nvs_alloc(void) { return 0; } +static inline void hibernate_nvs_free(void) {} +static inline void hibernate_nvs_save(void) {} +static inline void hibernate_nvs_restore(void) {} #endif /* CONFIG_HIBERNATION */ #ifdef CONFIG_PM_SLEEP diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 023ff2a31d89..a92c91451559 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -262,3 +262,125 @@ int swsusp_shrink_memory(void) return 0; } + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} -- cgit v1.2.3 From 69643279a88dea000ac2f858091d0e365f778245 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Nov 2008 21:32:44 +0100 Subject: Hibernate: Do not oops on resume if image data are incorrect During resume from hibernation using the userland interface image data are being passed from the used space process to the kernel. These data need not be valid, but currently the kernel sometimes oopses if it gets invalid image data, which is wrong. Make the kernel return error codes to the user space in such cases. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/power/snapshot.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5d2ab836e998..955c8cc91838 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -519,6 +519,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); +} + /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is @@ -1459,9 +1467,7 @@ load_header(struct swsusp_info *info) * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set * the corresponding bit in the memory bitmap @bm */ - -static inline void -unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1469,8 +1475,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - memory_bm_set_bit(bm, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); + else + return -EFAULT; } + + return 0; } /* List of "safe" pages that may be used to store data loaded from the suspend @@ -1608,7 +1619,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_page = page; if (safe_highmem_pages > 0) { @@ -1677,7 +1688,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) static inline void * get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) { - return NULL; + return ERR_PTR(-EINVAL); } static inline void copy_last_highmem_page(void) {} @@ -1788,8 +1799,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); @@ -1805,7 +1821,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); pbe->address = safe_pages_list; @@ -1868,7 +1884,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return error; } else if (handle->prev <= nr_meta_pages) { - unpack_orig_pfns(buffer, ©_bm); + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; + if (handle->prev == nr_meta_pages) { error = prepare_image(&orig_bm, ©_bm); if (error) @@ -1879,12 +1898,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; - if (!handle->buffer) - return -ENOMEM; + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); } } else { copy_last_highmem_page(); handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); if (handle->buffer != buffer) handle->sync_read = 0; } -- cgit v1.2.3 From 846705deb059c352cc0e5806d5964f815b8c6d98 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Nov 2008 18:00:24 -0500 Subject: Hibernate: Take overlapping zones into account (rev. 2) It has been requested to make hibernation work with memory hotplugging enabled and for this purpose the hibernation code has to be reworked to take the possible overlapping of zones into account. Thus, rework the hibernation memory bitmaps code to prevent duplication of PFNs from occuring and add checks to make sure that one page frame will not be marked as saveable many times. Additionally, use list.h lists instead of open-coded lists to implement the memory bitmaps. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/snapshot.c | 325 +++++++++++++++++++++++++----------------------- 1 file changed, 166 insertions(+), 159 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 955c8cc91838..ec9f153b2fc2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - /** * Data types related to memory bitmaps. * @@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { - struct bm_block *next; /* next element of the list */ + struct list_head hook; /* hook into a list of bitmap blocks */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ unsigned long *data; /* bitmap representing pages */ @@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb) return bb->end_pfn - bb->start_pfn; } -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; - /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct zone_bitmap *zone_bm; struct bm_block *block; int bit; }; struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -273,11 +259,7 @@ struct memory_bitmap { static void memory_bm_position_reset(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); bm->cur.bit = 0; } @@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects + * @nr_blocks - number of blocks to allocate + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) { - struct bm_block *bblist = NULL; + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); while (nr_blocks-- > 0) { struct bm_block *bb; bb = chain_alloc(ca, sizeof(struct bm_block)); if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; + return -ENOMEM; + list_add(&bb->hook, list); } - return bblist; + + return 0; } +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + /** - * create_zone_bm_list - create a list of zone bitmap objects + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; + + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { - struct zone_bitmap *zbmlist = NULL; + struct zone *zone; - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; + INIT_LIST_HEAD(list); - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) - return NULL; + for_each_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + if (!populated_zone(zone)) + continue; + + zone_start = zone->zone_start_pfn; + zone_end = zone->zone_start_pfn + zone->spanned_pages; - zbm->next = zbmlist; - zbmlist = zbm; + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } } - return zbmlist; + + return 0; } /** * memory_bm_create - allocate memory for a memory bitmap */ - static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; + struct list_head mem_extents; + struct mem_extent *ext; + int error; chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); - /* Compute the number of zones */ - nr = 0; - for_each_zone(zone) - if (populated_zone(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone(zone) { - unsigned long pfn; + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; - if (!populated_zone(zone)) - continue; + list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; + bb = list_entry(bm->blocks.prev, struct bm_block, hook); - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; - ptr = get_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { + if (pages >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ - pfn += nr; + pfn += pages; } bb->end_pfn = pfn; - bb = bb->next; } - zone_bm = zone_bm->next; } + bm->p_list = ca.chain; memory_bm_position_reset(bm); - return 0; + Exit: + free_mem_extents(&mem_extents); + return error; - Free: + Error: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; + goto Exit; } /** * memory_bm_free - free memory occupied by the memory bitmap @bm */ - static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct zone_bitmap *zone_bm; + struct bm_block *bb; - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; + + INIT_LIST_HEAD(&bm->blocks); } /** @@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. */ - static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - - if (!zone_bm) - return -EFAULT; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. + */ + bb = bm->cur.block; if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; - while (pfn >= bb->end_pfn) { - bb = bb->next; + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; - BUG_ON(!bb); - } - zone_bm->cur_block = bb; + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; pfn -= bb->start_pfn; + bm->cur.bit = pfn + 1; *bit_nr = pfn; *addr = bb->data; return 0; @@ -538,29 +548,21 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; struct bm_block *bb; int bit; + bb = bm->cur.block; do { - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = bb->next; - bm->cur.block = bb; - bm->cur.bit = 0; - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - bm->cur.bit = 0; - } - } while (zone_bm); + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + memory_bm_position_reset(bm); return BM_END_OF_MAP; @@ -816,8 +818,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ - -static struct page *saveable_highmem_page(unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -825,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(!PageHighMem(page)); @@ -854,13 +857,16 @@ unsigned int count_highmem_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(pfn)) + if (saveable_highmem_page(zone, pfn)) n++; } return n; } #else -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -871,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ - -static struct page *saveable_page(unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -880,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(PageHighMem(page)); @@ -911,7 +918,7 @@ unsigned int count_data_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if(saveable_page(pfn)) + if (saveable_page(zone, pfn)) n++; } return n; @@ -952,7 +959,7 @@ static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? - saveable_highmem_page(pfn) : saveable_page(pfn); + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) @@ -983,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } } #else -#define page_is_saveable(zone, pfn) saveable_page(pfn) +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { -- cgit v1.2.3 From baa5835df10254762aedb6cb23a9c1508f969736 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 8 Dec 2008 00:52:49 +0100 Subject: Hibernate: Replace unnecessary evaluation of pfn_to_page() Replace one evaluation of pfn_to_page() in copy_data_pages() with the value of a local variable containing the right number already. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ec9f153b2fc2..f5fc2d7680f2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -981,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); + dst = kmap_atomic(d_page, KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { -- cgit v1.2.3 From edb123e16c6092bd08b67d1130ff03efeada0c89 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 4 Dec 2008 12:39:49 +0100 Subject: trivial: printk: fix indentation of new_text_line declaration Remove bogus indentation of new_text_line declaration introduced in commit ac60ad741. Acked-by: Nick Andrew Signed-off-by: Jiri Kosina --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index e651ab05655f..7015733793e8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) static const char recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int recursion_bug; - static int new_text_line = 1; +static int new_text_line = 1; static char printk_buf[1024]; asmlinkage int vprintk(const char *fmt, va_list args) -- cgit v1.2.3 From 025dfdafe77f20b3890981a394774baab7b9c827 Mon Sep 17 00:00:00 2001 From: Frederik Schwarzer Date: Thu, 16 Oct 2008 19:02:37 +0200 Subject: trivial: fix then -> than typos in comments and documentation - (better, more, bigger ...) then -> (...) than Signed-off-by: Frederik Schwarzer Signed-off-by: Jiri Kosina --- Documentation/hwmon/abituguru-datasheet | 6 +++--- Documentation/networking/rxrpc.txt | 2 +- Documentation/scsi/ChangeLog.lpfc | 2 +- arch/blackfin/kernel/kgdb.c | 2 +- arch/ia64/kernel/kprobes.c | 2 +- arch/m68k/Kconfig | 2 +- arch/mips/pmc-sierra/yosemite/atmel_read_eeprom.c | 2 +- arch/powerpc/kernel/kprobes.c | 2 +- arch/powerpc/oprofile/cell/spu_profiler.c | 2 +- arch/s390/Kconfig | 2 +- arch/s390/kernel/kprobes.c | 2 +- arch/sparc/kernel/kprobes.c | 2 +- arch/x86/kernel/kprobes.c | 2 +- arch/x86/kernel/mfgpt_32.c | 2 +- drivers/hwmon/fschmd.c | 2 +- drivers/infiniband/hw/mlx4/cq.c | 2 +- drivers/message/i2o/i2o_scsi.c | 2 +- drivers/mtd/devices/pmc551.c | 2 +- drivers/mtd/ubi/eba.c | 2 +- drivers/mtd/ubi/io.c | 2 +- drivers/mtd/ubi/scan.c | 2 +- drivers/mtd/ubi/ubi-media.h | 4 ++-- drivers/mtd/ubi/vtbl.c | 2 +- drivers/mtd/ubi/wl.c | 4 ++-- drivers/net/bnx2x_link.c | 2 +- drivers/net/e1000/e1000_hw.c | 4 ++-- drivers/net/slip.h | 2 +- drivers/net/tehuti.c | 4 ++-- drivers/net/tokenring/smctr.c | 2 +- drivers/net/wireless/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/rt2x00/rt2x00crypto.c | 4 ++-- drivers/net/wireless/strip.c | 2 +- drivers/s390/block/dasd_eer.c | 4 ++-- drivers/s390/char/vmlogrdr.c | 4 ++-- drivers/scsi/lpfc/lpfc_hbadisc.c | 4 ++-- drivers/scsi/lpfc/lpfc_sli.c | 10 +++++----- drivers/serial/crisv10.c | 4 ++-- drivers/video/console/vgacon.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/proc/task_nommu.c | 2 +- fs/ubifs/Kconfig | 2 +- fs/ubifs/budget.c | 4 ++-- fs/ubifs/gc.c | 2 +- fs/ubifs/journal.c | 2 +- fs/ubifs/shrinker.c | 2 +- fs/xfs/linux-2.6/xfs_super.c | 2 +- include/linux/mtd/mtd.h | 2 +- include/linux/spi/spi.h | 4 ++-- include/mtd/ubi-user.h | 2 +- kernel/pid.c | 2 +- kernel/time/jiffies.c | 2 +- net/sctp/auth.c | 4 ++-- net/sctp/sm_statefuns.c | 6 +++--- net/sctp/socket.c | 2 +- net/sctp/tsnmap.c | 2 +- sound/usb/usx2y/usbusx2y.c | 2 +- 56 files changed, 76 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/Documentation/hwmon/abituguru-datasheet b/Documentation/hwmon/abituguru-datasheet index aef5a9b36846..4d184f2db0ea 100644 --- a/Documentation/hwmon/abituguru-datasheet +++ b/Documentation/hwmon/abituguru-datasheet @@ -74,7 +74,7 @@ a sensor. Notice that some banks have both a read and a write address this is how the uGuru determines if a read from or a write to the bank is taking place, thus when reading you should always use the read address and when writing the -write address. The write address is always one (1) more then the read address. +write address. The write address is always one (1) more than the read address. uGuru ready @@ -224,7 +224,7 @@ Bit 3: Beep if alarm (RW) Bit 4: 1 if alarm cause measured temp is over the warning threshold (R) Bit 5: 1 if alarm cause measured volt is over the max threshold (R) Bit 6: 1 if alarm cause measured volt is under the min threshold (R) -Bit 7: Volt sensor: Shutdown if alarm persist for more then 4 seconds (RW) +Bit 7: Volt sensor: Shutdown if alarm persist for more than 4 seconds (RW) Temp sensor: Shutdown if temp is over the shutdown threshold (RW) * This bit is only honored/used by the uGuru if a temp sensor is connected @@ -293,7 +293,7 @@ Byte 0: Alarm behaviour for the selected sensor. A 1 enables the described behaviour. Bit 0: Give an alarm if measured rpm is under the min threshold (RW) Bit 3: Beep if alarm (RW) -Bit 7: Shutdown if alarm persist for more then 4 seconds (RW) +Bit 7: Shutdown if alarm persist for more than 4 seconds (RW) Byte 1: min threshold (scale as bank 0x26) diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index c3669a3fb4af..60d05eb77c64 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -540,7 +540,7 @@ A client would issue an operation by: MSG_MORE should be set in msghdr::msg_flags on all but the last part of the request. Multiple requests may be made simultaneously. - If a call is intended to go to a destination other then the default + If a call is intended to go to a destination other than the default specified through connect(), then msghdr::msg_name should be set on the first request message of that call. diff --git a/Documentation/scsi/ChangeLog.lpfc b/Documentation/scsi/ChangeLog.lpfc index ae3f962a7cfc..ff19a52fe004 100644 --- a/Documentation/scsi/ChangeLog.lpfc +++ b/Documentation/scsi/ChangeLog.lpfc @@ -733,7 +733,7 @@ Changes from 20040920 to 20041018 I/O completion path a little more, especially taking care of fast-pathing the non-error case. Also removes tons of dead members and defines from lpfc_scsi.h - e.g. lpfc_target is down - to nothing more then the lpfc_nodelist pointer. + to nothing more than the lpfc_nodelist pointer. * Added binary sysfs file to issue mbox commands * Replaced #if __BIG_ENDIAN with #if __BIG_ENDIAN_BITFIELD for compatibility with the user space applications. diff --git a/arch/blackfin/kernel/kgdb.c b/arch/blackfin/kernel/kgdb.c index b795a207742c..1c5afaeb9504 100644 --- a/arch/blackfin/kernel/kgdb.c +++ b/arch/blackfin/kernel/kgdb.c @@ -105,7 +105,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) * Extracts ebp, esp and eip values understandable by gdb from the values * saved by switch_to. * thread.esp points to ebp. flags and ebp are pushed in switch_to hence esp - * prior to entering switch_to is 8 greater then the value that is saved. + * prior to entering switch_to is 8 greater than the value that is saved. * If switch_to changes, change following code appropriately. */ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index f07688da947c..0017b9de2ddf 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -434,7 +434,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) /* * It is possible to have multiple instances associated with a given * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return + * have a return probe installed on them, and/or more than one return * return probe was registered for a target function. * * We can handle this because: diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index c825bde17cb3..fb87c08c6b57 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -303,7 +303,7 @@ config M68KFPU_EMU_EXTRAPREC correct rounding, the emulator can (often) do the same but this extra calculation can cost quite some time, so you can disable it here. The emulator will then "only" calculate with a 64 bit - mantissa and round slightly incorrect, what is more then enough + mantissa and round slightly incorrect, what is more than enough for normal usage. config M68KFPU_EMU_ONLY diff --git a/arch/mips/pmc-sierra/yosemite/atmel_read_eeprom.c b/arch/mips/pmc-sierra/yosemite/atmel_read_eeprom.c index 97862f45496d..caf5e9a0acc7 100644 --- a/arch/mips/pmc-sierra/yosemite/atmel_read_eeprom.c +++ b/arch/mips/pmc-sierra/yosemite/atmel_read_eeprom.c @@ -148,7 +148,7 @@ int read_eeprom(char *buffer, int eeprom_size, int size) send_byte(W_HEADER); recv_ack(); - /* EEPROM with size of more then 2K need two byte addressing */ + /* EEPROM with size of more than 2K need two byte addressing */ if (eeprom_size > 2048) { send_byte(0x00); recv_ack(); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index de79915452c8..b29005a5a8f5 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -316,7 +316,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, /* * It is possible to have multiple instances associated with a given * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return + * have a return probe installed on them, and/or more than one return * return probe was registered for a target function. * * We can handle this because: diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c index dd499c3e9da7..83faa958b9d4 100644 --- a/arch/powerpc/oprofile/cell/spu_profiler.c +++ b/arch/powerpc/oprofile/cell/spu_profiler.c @@ -49,7 +49,7 @@ void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_rese * of precision. This is close enough for the purpose at hand. * * The value of the timeout should be small enough that the hw - * trace buffer will not get more then about 1/3 full for the + * trace buffer will not get more than about 1/3 full for the * maximum user specified (the LFSR value) hw sampling frequency. * This is to ensure the trace buffer will never fill even if the * kernel thread scheduling varies under a heavy system load. diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 19577aeffd7b..a94a3c3ae932 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -299,7 +299,7 @@ config WARN_STACK This option enables the compiler options -mwarn-framesize and -mwarn-dynamicstack. If the compiler supports these options it will generate warnings for function which either use alloca or - create a stack frame bigger then CONFIG_WARN_STACK_SIZE. + create a stack frame bigger than CONFIG_WARN_STACK_SIZE. Say N if you are unsure. diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 569079ec4ff0..267f6698680a 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -381,7 +381,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, /* * It is possible to have multiple instances associated with a given * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return + * have a return probe installed on them, and/or more than one return * return probe was registered for a target function. * * We can handle this because: diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c index 201a6e547e4a..3bc6527c95af 100644 --- a/arch/sparc/kernel/kprobes.c +++ b/arch/sparc/kernel/kprobes.c @@ -517,7 +517,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) /* * It is possible to have multiple instances associated with a given * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return + * have a return probe installed on them, and/or more than one return * return probe was registered for a target function. * * We can handle this because: diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 6c27679ec6aa..a116e6d5726c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -694,7 +694,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) /* * It is possible to have multiple instances associated with a given * task either because multiple functions in the call path have - * return probes installed on them, and/or more then one + * return probes installed on them, and/or more than one * return probe was registered for a target function. * * We can handle this because: diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index c12314c9e86f..8815f3c7fec7 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -252,7 +252,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); /* * The MFPGT timers on the CS5536 provide us with suitable timers to use * as clock event sources - not as good as a HPET or APIC, but certainly - * better then the PIT. This isn't a general purpose MFGPT driver, but + * better than the PIT. This isn't a general purpose MFGPT driver, but * a simplified one designed specifically to act as a clock event source. * For full details about the MFGPT, please consult the CS5536 data sheet. */ diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c index 967170368933..8b2d756595d9 100644 --- a/drivers/hwmon/fschmd.c +++ b/drivers/hwmon/fschmd.c @@ -75,7 +75,7 @@ static const u8 FSCHMD_REG_VOLT[3] = { 0x45, 0x42, 0x48 }; /* minimum pwm at which the fan is driven (pwm can by increased depending on the temp. Notice that for the scy some fans share there minimum speed. - Also notice that with the scy the sensor order is different then with the + Also notice that with the scy the sensor order is different than with the other chips, this order was in the 2.4 driver and kept for consistency. */ static const u8 FSCHMD_REG_FAN_MIN[5][6] = { { 0x55, 0x65 }, /* pos */ diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index a3c5af1d7ec0..de5263beab4a 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -367,7 +367,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) if (err) goto out; } else { - /* Can't be smaller then the number of outstanding CQEs */ + /* Can't be smaller than the number of outstanding CQEs */ outst_cqe = mlx4_ib_get_outstanding_cqes(cq); if (entries < outst_cqe + 1) { err = 0; diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c index 1bcdbbb9e7d3..3d45817e6dcd 100644 --- a/drivers/message/i2o/i2o_scsi.c +++ b/drivers/message/i2o/i2o_scsi.c @@ -390,7 +390,7 @@ static int i2o_scsi_reply(struct i2o_controller *c, u32 m, * @i2o_dev: the I2O device which was added * * If a I2O device is added we catch the notification, because I2O classes - * other then SCSI peripheral will not be received through + * other than SCSI peripheral will not be received through * i2o_scsi_probe(). */ static void i2o_scsi_notify_device_add(struct i2o_device *i2o_dev) diff --git a/drivers/mtd/devices/pmc551.c b/drivers/mtd/devices/pmc551.c index d38bca64bb15..d2fd550f7e09 100644 --- a/drivers/mtd/devices/pmc551.c +++ b/drivers/mtd/devices/pmc551.c @@ -34,7 +34,7 @@ * aperture size, not the dram size, and the V370PDC supplies no * other method for memory size discovery. This problem is * mostly only relevant when compiled as a module, as the - * unloading of the module with an aperture size smaller then + * unloading of the module with an aperture size smaller than * the ram will cause the driver to detect the onboard memory * size to be equal to the aperture size when the module is * reloaded. Soooo, to help, the module supports an msize diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index 048a606cebde..25def348e5ba 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -717,7 +717,7 @@ write_error: * to the real data size, although the @buf buffer has to contain the * alignment. In all other cases, @len has to be aligned. * - * It is prohibited to write more then once to logical eraseblocks of static + * It is prohibited to write more than once to logical eraseblocks of static * volumes. This function returns zero in case of success and a negative error * code in case of failure. */ diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index a74118c05745..fe81039f2a7c 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -465,7 +465,7 @@ out: * This function synchronously erases physical eraseblock @pnum. If @torture * flag is not zero, the physical eraseblock is checked by means of writing * different patterns to it and reading them back. If the torturing is enabled, - * the physical eraseblock is erased more then once. + * the physical eraseblock is erased more than once. * * This function returns the number of erasures made in case of success, %-EIO * if the erasure failed or the torturing test failed, and other negative error diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c index 41d47e1cf15c..ecde202a5a12 100644 --- a/drivers/mtd/ubi/scan.c +++ b/drivers/mtd/ubi/scan.c @@ -478,7 +478,7 @@ int ubi_scan_add_used(struct ubi_device *ubi, struct ubi_scan_info *si, return 0; } else { /* - * This logical eraseblock is older then the one found + * This logical eraseblock is older than the one found * previously. */ if (cmp_res & 4) diff --git a/drivers/mtd/ubi/ubi-media.h b/drivers/mtd/ubi/ubi-media.h index 2ad940409053..8419fdccc79c 100644 --- a/drivers/mtd/ubi/ubi-media.h +++ b/drivers/mtd/ubi/ubi-media.h @@ -135,7 +135,7 @@ enum { * The erase counter header takes 64 bytes and has a plenty of unused space for * future usage. The unused fields are zeroed. The @version field is used to * indicate the version of UBI implementation which is supposed to be able to - * work with this UBI image. If @version is greater then the current UBI + * work with this UBI image. If @version is greater than the current UBI * version, the image is rejected. This may be useful in future if something * is changed radically. This field is duplicated in the volume identifier * header. @@ -187,7 +187,7 @@ struct ubi_ec_hdr { * (sequence number) is used to distinguish between older and newer versions of * logical eraseblocks. * - * There are 2 situations when there may be more then one physical eraseblock + * There are 2 situations when there may be more than one physical eraseblock * corresponding to the same logical eraseblock, i.e., having the same @vol_id * and @lnum values in the volume identifier header. Suppose we have a logical * eraseblock L and it is mapped to the physical eraseblock P. diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c index 333c8941552f..1afc61e7455d 100644 --- a/drivers/mtd/ubi/vtbl.c +++ b/drivers/mtd/ubi/vtbl.c @@ -577,7 +577,7 @@ static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si, if (vtbl[i].flags & UBI_VTBL_AUTORESIZE_FLG) { /* Auto re-size flag may be set only for one volume */ if (ubi->autoresize_vol_id != -1) { - ubi_err("more then one auto-resize volume (%d " + ubi_err("more than one auto-resize volume (%d " "and %d)", ubi->autoresize_vol_id, i); kfree(vol); return -EINVAL; diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 14901cb82c18..891534f8210d 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -128,7 +128,7 @@ * situation when the picked physical eraseblock is constantly erased after the * data is written to it. So, we have a constant which limits the highest erase * counter of the free physical eraseblock to pick. Namely, the WL sub-system - * does not pick eraseblocks with erase counter greater then the lowest erase + * does not pick eraseblocks with erase counter greater than the lowest erase * counter plus %WL_FREE_MAX_DIFF. */ #define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD) @@ -917,7 +917,7 @@ static int ensure_wear_leveling(struct ubi_device *ubi) /* * We schedule wear-leveling only if the difference between the * lowest erase counter of used physical eraseblocks and a high - * erase counter of free physical eraseblocks is greater then + * erase counter of free physical eraseblocks is greater than * %UBI_WL_THRESHOLD. */ e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); diff --git a/drivers/net/bnx2x_link.c b/drivers/net/bnx2x_link.c index 67de94f1f30e..fefa6ab13064 100644 --- a/drivers/net/bnx2x_link.c +++ b/drivers/net/bnx2x_link.c @@ -3359,7 +3359,7 @@ static u8 bnx2x_format_ver(u32 num, u8 *str, u16 len) u8 shift = 8*4; u8 digit; if (len < 10) { - /* Need more then 10chars for this format */ + /* Need more than 10chars for this format */ *str_ptr = '\0'; return -EINVAL; } diff --git a/drivers/net/e1000/e1000_hw.c b/drivers/net/e1000/e1000_hw.c index d04eef53571e..e1a3fc1303ee 100644 --- a/drivers/net/e1000/e1000_hw.c +++ b/drivers/net/e1000/e1000_hw.c @@ -6758,7 +6758,7 @@ static s32 e1000_get_cable_length(struct e1000_hw *hw, u16 *min_length, * returns: - E1000_ERR_XXX * E1000_SUCCESS * - * For phy's older then IGP, this function simply reads the polarity bit in the + * For phy's older than IGP, this function simply reads the polarity bit in the * Phy Status register. For IGP phy's, this bit is valid only if link speed is * 10 Mbps. If the link speed is 100 Mbps there is no polarity so this bit will * return 0. If the link speed is 1000 Mbps the polarity status is in the @@ -6834,7 +6834,7 @@ static s32 e1000_check_polarity(struct e1000_hw *hw, * returns: - E1000_ERR_XXX * E1000_SUCCESS * - * For phy's older then IGP, this function reads the Downshift bit in the Phy + * For phy's older than IGP, this function reads the Downshift bit in the Phy * Specific Status register. For IGP phy's, it reads the Downgrade bit in the * Link Health register. In IGP this bit is latched high, so the driver must * read it immediately after link is established. diff --git a/drivers/net/slip.h b/drivers/net/slip.h index 853e0f6ec710..9ea5c11287d2 100644 --- a/drivers/net/slip.h +++ b/drivers/net/slip.h @@ -75,7 +75,7 @@ struct slip { unsigned long tx_errors; /* Planned stuff */ unsigned long rx_dropped; /* No memory for skb */ unsigned long tx_dropped; /* When MTU change */ - unsigned long rx_over_errors; /* Frame bigger then SLIP buf. */ + unsigned long rx_over_errors; /* Frame bigger than SLIP buf. */ #ifdef SL_INCLUDE_CSLIP unsigned long tx_compressed; unsigned long rx_compressed; diff --git a/drivers/net/tehuti.c b/drivers/net/tehuti.c index a10a83a11d9f..a7a4dc4d6313 100644 --- a/drivers/net/tehuti.c +++ b/drivers/net/tehuti.c @@ -1004,7 +1004,7 @@ static inline void bdx_rxdb_free_elem(struct rxdb *db, int n) * skb for rx. It assumes that Rx is desabled in HW * funcs are grouped for better cache usage * - * RxD fifo is smaller then RxF fifo by design. Upon high load, RxD will be + * RxD fifo is smaller than RxF fifo by design. Upon high load, RxD will be * filled and packets will be dropped by nic without getting into host or * cousing interrupt. Anyway, in that condition, host has no chance to proccess * all packets, but dropping in nic is cheaper, since it takes 0 cpu cycles @@ -1826,7 +1826,7 @@ static void bdx_tx_free(struct bdx_priv *priv) * * Pushes desc to TxD fifo and overlaps it if needed. * NOTE: this func does not check for available space. this is responsibility - * of the caller. Neither does it check that data size is smaller then + * of the caller. Neither does it check that data size is smaller than * fifo size. */ static void bdx_tx_push_desc(struct bdx_priv *priv, void *data, int size) diff --git a/drivers/net/tokenring/smctr.c b/drivers/net/tokenring/smctr.c index a011666342ff..50eb29ce3c87 100644 --- a/drivers/net/tokenring/smctr.c +++ b/drivers/net/tokenring/smctr.c @@ -3064,7 +3064,7 @@ static int smctr_load_node_addr(struct net_device *dev) * will consequently cause a timeout. * * NOTE 1: If the monitor_state is MS_BEACON_TEST_STATE, all transmit - * queues other then the one used for the lobe_media_test should be + * queues other than the one used for the lobe_media_test should be * disabled.!? * * NOTE 2: If the monitor_state is MS_BEACON_TEST_STATE and the receive_mask diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c index 1667065b86a7..753de1a9c4b3 100644 --- a/drivers/net/wireless/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/ipw2x00/ipw2100.c @@ -1332,7 +1332,7 @@ static int ipw2100_power_cycle_adapter(struct ipw2100_priv *priv) IPW_AUX_HOST_RESET_REG_STOP_MASTER); /* Step 2. Wait for stop Master Assert - * (not more then 50us, otherwise ret error */ + * (not more than 50us, otherwise ret error */ i = 5; do { udelay(IPW_WAIT_RESET_MASTER_ASSERT_COMPLETE_DELAY); diff --git a/drivers/net/wireless/rt2x00/rt2x00crypto.c b/drivers/net/wireless/rt2x00/rt2x00crypto.c index 37ad0d2fb64c..aee9cba13eb3 100644 --- a/drivers/net/wireless/rt2x00/rt2x00crypto.c +++ b/drivers/net/wireless/rt2x00/rt2x00crypto.c @@ -184,8 +184,8 @@ void rt2x00crypto_rx_insert_iv(struct sk_buff *skb, unsigned int align, * Make room for new data, note that we increase both * headsize and tailsize when required. The tailsize is * only needed when ICV data needs to be inserted and - * the padding is smaller then the ICV data. - * When alignment requirements is greater then the + * the padding is smaller than the ICV data. + * When alignment requirements is greater than the * ICV data we must trim the skb to the correct size * because we need to remove the extra bytes. */ diff --git a/drivers/net/wireless/strip.c b/drivers/net/wireless/strip.c index dd0de3a9ed4e..7015f2480550 100644 --- a/drivers/net/wireless/strip.c +++ b/drivers/net/wireless/strip.c @@ -236,7 +236,7 @@ struct strip { unsigned long tx_errors; /* Planned stuff */ unsigned long rx_dropped; /* No memory for skb */ unsigned long tx_dropped; /* When MTU change */ - unsigned long rx_over_errors; /* Frame bigger then STRIP buf. */ + unsigned long rx_over_errors; /* Frame bigger than STRIP buf. */ unsigned long pps_timer; /* Timer to determine pps */ unsigned long rx_pps_count; /* Counter to determine pps */ diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index 892e2878d61b..f8e05ce98621 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -535,8 +535,8 @@ static int dasd_eer_open(struct inode *inp, struct file *filp) eerb->buffer_page_count > INT_MAX / PAGE_SIZE) { kfree(eerb); MESSAGE(KERN_WARNING, "can't open device since module " - "parameter eer_pages is smaller then 1 or" - " bigger then %d", (int)(INT_MAX / PAGE_SIZE)); + "parameter eer_pages is smaller than 1 or" + " bigger than %d", (int)(INT_MAX / PAGE_SIZE)); unlock_kernel(); return -EINVAL; } diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index aabbeb909cc6..d8a2289fcb69 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -427,7 +427,7 @@ static int vmlogrdr_receive_data(struct vmlogrdr_priv_t *priv) buffer = priv->buffer + sizeof(int); } /* - * If the record is bigger then our buffer, we receive only + * If the record is bigger than our buffer, we receive only * a part of it. We can get the rest later. */ if (iucv_data_count > NET_BUFFER_SIZE) @@ -437,7 +437,7 @@ static int vmlogrdr_receive_data(struct vmlogrdr_priv_t *priv) 0, buffer, iucv_data_count, &priv->residual_length); spin_unlock_bh(&priv->priv_lock); - /* An rc of 5 indicates that the record was bigger then + /* An rc of 5 indicates that the record was bigger than * the buffer, which is OK for us. A 9 indicates that the * record was purged befor we could receive it. */ diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 8c64494444bf..311ed6dea726 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -1964,10 +1964,10 @@ lpfc_set_disctmo(struct lpfc_vport *vport) uint32_t tmo; if (vport->port_state == LPFC_LOCAL_CFG_LINK) { - /* For FAN, timeout should be greater then edtov */ + /* For FAN, timeout should be greater than edtov */ tmo = (((phba->fc_edtov + 999) / 1000) + 1); } else { - /* Normal discovery timeout should be > then ELS/CT timeout + /* Normal discovery timeout should be > than ELS/CT timeout * FC spec states we need 3 * ratov for CT requests */ tmo = ((phba->fc_ratov * 3) + 3); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 01dfdc8696f8..a36a120561e2 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -420,7 +420,7 @@ lpfc_sli_next_iocb_slot (struct lpfc_hba *phba, struct lpfc_sli_ring *pring) if (unlikely(pring->local_getidx >= max_cmd_idx)) { lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "0315 Ring %d issue: portCmdGet %d " - "is bigger then cmd ring %d\n", + "is bigger than cmd ring %d\n", pring->ringno, pring->local_getidx, max_cmd_idx); @@ -1628,12 +1628,12 @@ lpfc_sli_rsp_pointers_error(struct lpfc_hba *phba, struct lpfc_sli_ring *pring) { struct lpfc_pgp *pgp = &phba->port_gp[pring->ringno]; /* - * Ring handler: portRspPut is bigger then + * Ring handler: portRspPut is bigger than * rsp ring */ lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "0312 Ring %d handler: portRspPut %d " - "is bigger then rsp ring %d\n", + "is bigger than rsp ring %d\n", pring->ringno, le32_to_cpu(pgp->rspPutInx), pring->numRiocb); @@ -2083,12 +2083,12 @@ lpfc_sli_handle_slow_ring_event(struct lpfc_hba *phba, portRspPut = le32_to_cpu(pgp->rspPutInx); if (portRspPut >= portRspMax) { /* - * Ring handler: portRspPut is bigger then + * Ring handler: portRspPut is bigger than * rsp ring */ lpfc_printf_log(phba, KERN_ERR, LOG_SLI, "0303 Ring %d handler: portRspPut %d " - "is bigger then rsp ring %d\n", + "is bigger than rsp ring %d\n", pring->ringno, portRspPut, portRspMax); phba->link_state = LPFC_HBA_ERROR; diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c index 8b2c619a09f2..e642c22c80e2 100644 --- a/drivers/serial/crisv10.c +++ b/drivers/serial/crisv10.c @@ -1203,7 +1203,7 @@ static void e100_disable_txdma_channel(struct e100_serial *info) unsigned long flags; /* Disable output DMA channel for the serial port in question - * ( set to something other then serialX) + * ( set to something other than serialX) */ local_irq_save(flags); DFLOW(DEBUG_LOG(info->line, "disable_txdma_channel %i\n", info->line)); @@ -1266,7 +1266,7 @@ static void e100_disable_rxdma_channel(struct e100_serial *info) unsigned long flags; /* Disable input DMA channel for the serial port in question - * ( set to something other then serialX) + * ( set to something other than serialX) */ local_irq_save(flags); if (info->line == 0) { diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c index e6210725b9ab..d012edda6d11 100644 --- a/drivers/video/console/vgacon.c +++ b/drivers/video/console/vgacon.c @@ -1332,7 +1332,7 @@ static void vgacon_save_screen(struct vc_data *c) c->vc_y = screen_info.orig_y; } - /* We can't copy in more then the size of the video buffer, + /* We can't copy in more than the size of the video buffer, * or we'll be copying in VGA BIOS */ if (!vga_is_gfx) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6ebaa58e2c03..04697ba7f73e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -854,7 +854,7 @@ static int o2hb_thread(void *data) while (!kthread_should_stop() && !reg->hr_unclean_stop) { /* We track the time spent inside - * o2hb_do_disk_heartbeat so that we avoid more then + * o2hb_do_disk_heartbeat so that we avoid more than * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 219bd79ea894..d4a8be32b902 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -9,7 +9,7 @@ /* * Logic: we've got two memory sums for each process, "shared", and - * "non-shared". Shared memory may get counted more then once, for + * "non-shared". Shared memory may get counted more than once, for * each process that owns it. Non-shared memory is counted * accurately. */ diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 91ceeda7e5bf..e35b54d5059d 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB depends on UBIFS_FS default y help - Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. + Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. # Debugging-related stuff config UBIFS_FS_DEBUG diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 0e5e54d82924..175f9c590b77 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -142,7 +142,7 @@ static long long get_liability(struct ubifs_info *c) * * This function is called when an operation cannot be budgeted because there * is supposedly no free space. But in most cases there is some free space: - * o budgeting is pessimistic, so it always budgets more then it is actually + * o budgeting is pessimistic, so it always budgets more than it is actually * needed, so shrinking the liability is one way to make free space - the * cached data will take less space then it was budgeted for; * o GC may turn some dark space into free space (budgeting treats dark space @@ -606,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) * @c: UBIFS file-system description object * * This function converts budget which was allocated for a new page of data to - * the budget of changing an existing page of data. The latter is smaller then + * the budget of changing an existing page of data. The latter is smaller than * the former, so this function only does simple re-calculation and does not * involve any write-back. */ diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 0bef6501d58a..9832f9abe28e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -45,7 +45,7 @@ #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ /* - * GC may need to move more then one LEB to make progress. The below constants + * GC may need to move more than one LEB to make progress. The below constants * define "soft" and "hard" limits on the number of LEBs the garbage collector * may move. */ diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 10ae25b7d1db..9b7c54e0cd2a 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -191,7 +191,7 @@ again: if (wbuf->lnum != -1 && avail >= len) { /* * Someone else has switched the journal head and we have - * enough space now. This happens when more then one process is + * enough space now. This happens when more than one process is * trying to write to the same journal head at the same time. */ dbg_jnl("return LEB %d back, already have LEB %d:%d", diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index f248533841a2..e7bab52a1410 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) * @contention: if any contention, this is set to %1 * * This function walks the list of mounted UBIFS file-systems and frees clean - * znodes which are older then @age, until at least @nr znodes are freed. + * znodes which are older than @age, until at least @nr znodes are freed. * Returns the number of freed znodes. */ static int shrink_tnc_trees(int nr, int age, int *contention) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 36f6cc703ef2..be846d606ae8 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1348,7 +1348,7 @@ xfs_finish_flags( { int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); - /* Fail a mount where the logbuf is smaller then the log stripe */ + /* Fail a mount where the logbuf is smaller than the log stripe */ if (xfs_sb_version_haslogv2(&mp->m_sb)) { if (mp->m_logbsize <= 0 && mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index eae26bb6430a..64433eb411d7 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -83,7 +83,7 @@ typedef enum { * @datbuf: data buffer - if NULL only oob data are read/written * @oobbuf: oob data buffer * - * Note, it is allowed to read more then one OOB area at one go, but not write. + * Note, it is allowed to read more than one OOB area at one go, but not write. * The interface assumes that the OOB write requests program only one page's * OOB area. */ diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 82229317753d..68bb1c501d0d 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -327,9 +327,9 @@ extern struct spi_master *spi_busnum_to_master(u16 busnum); * @tx_dma: DMA address of tx_buf, if @spi_message.is_dma_mapped * @rx_dma: DMA address of rx_buf, if @spi_message.is_dma_mapped * @len: size of rx and tx buffers (in bytes) - * @speed_hz: Select a speed other then the device default for this + * @speed_hz: Select a speed other than the device default for this * transfer. If 0 the default (from @spi_device) is used. - * @bits_per_word: select a bits_per_word other then the device default + * @bits_per_word: select a bits_per_word other than the device default * for this transfer. If 0 the default (from @spi_device) is used. * @cs_change: affects chipselect after this transfer completes * @delay_usecs: microseconds to delay after this transfer before diff --git a/include/mtd/ubi-user.h b/include/mtd/ubi-user.h index ccdc562e444e..2dc2eb2b8e22 100644 --- a/include/mtd/ubi-user.h +++ b/include/mtd/ubi-user.h @@ -253,7 +253,7 @@ struct ubi_mkvol_req { * * Re-sizing is possible for both dynamic and static volumes. But while dynamic * volumes may be re-sized arbitrarily, static volumes cannot be made to be - * smaller then the number of bytes they bear. To arbitrarily shrink a static + * smaller than the number of bytes they bear. To arbitrarily shrink a static * volume, it must be wiped out first (by means of volume update operation with * zero number of bytes). */ diff --git a/kernel/pid.c b/kernel/pid.c index 064e76afa507..af9224cdd6c0 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -475,7 +475,7 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) EXPORT_SYMBOL(task_session_nr_ns); /* - * Used by proc to find the first pid that is greater then or equal to nr. + * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1ca99557e929..06f197560f3b 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -45,7 +45,7 @@ * * The value 8 is somewhat carefully chosen, as anything * larger can result in overflows. NSEC_PER_JIFFY grows as - * HZ shrinks, so values greater then 8 overflow 32bits when + * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. */ #define JIFFIES_SHIFT 8 diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 52db5f60daa0..20c576f530fa 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -141,8 +141,8 @@ void sctp_auth_destroy_keys(struct list_head *keys) /* Compare two byte vectors as numbers. Return values * are: * 0 - vectors are equal - * < 0 - vector 1 is smaller then vector2 - * > 0 - vector 1 is greater then vector2 + * < 0 - vector 1 is smaller than vector2 + * > 0 - vector 1 is greater than vector2 * * Algorithm is: * This is performed by selecting the numerically smaller key vector... diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 1c4e5d6c29c0..3a0cd075914f 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4268,9 +4268,9 @@ nomem: /* * Handle a protocol violation when the chunk length is invalid. - * "Invalid" length is identified as smaller then the minimal length a + * "Invalid" length is identified as smaller than the minimal length a * given chunk can be. For example, a SACK chunk has invalid length - * if it's length is set to be smaller then the size of sctp_sack_chunk_t. + * if its length is set to be smaller than the size of sctp_sack_chunk_t. * * We inform the other end by sending an ABORT with a Protocol Violation * error code. @@ -4300,7 +4300,7 @@ static sctp_disposition_t sctp_sf_violation_chunklen( /* * Handle a protocol violation when the parameter length is invalid. - * "Invalid" length is identified as smaller then the minimal length a + * "Invalid" length is identified as smaller than the minimal length a * given parameter can be. */ static sctp_disposition_t sctp_sf_violation_paramlen( diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b14a8f33e42d..ff0a8f88de04 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2717,7 +2717,7 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, int o paths++; } - /* Only validate asocmaxrxt if we have more then + /* Only validate asocmaxrxt if we have more than * one path/transport. We do this because path * retransmissions are only counted when we have more * then one path. diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c index 35c73e82553a..9bd64565021a 100644 --- a/net/sctp/tsnmap.c +++ b/net/sctp/tsnmap.c @@ -227,7 +227,7 @@ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn) */ bitmap_zero(map->tsn_map, map->len); } else { - /* If the gap is smaller then the map size, + /* If the gap is smaller than the map size, * shift the map by 'gap' bits and update further. */ bitmap_shift_right(map->tsn_map, map->tsn_map, gap, map->len); diff --git a/sound/usb/usx2y/usbusx2y.c b/sound/usb/usx2y/usbusx2y.c index ca26c532e77e..11639bd72a51 100644 --- a/sound/usb/usx2y/usbusx2y.c +++ b/sound/usb/usx2y/usbusx2y.c @@ -238,7 +238,7 @@ static void i_usX2Y_In04Int(struct urb *urb) send = 0; for (j = 0; j < URBS_AsyncSeq && !err; ++j) if (0 == usX2Y->AS04.urb[j]->status) { - struct us428_p4out *p4out = us428ctls->p4out + send; // FIXME if more then 1 p4out is new, 1 gets lost. + struct us428_p4out *p4out = us428ctls->p4out + send; // FIXME if more than 1 p4out is new, 1 gets lost. usb_fill_bulk_urb(usX2Y->AS04.urb[j], usX2Y->chip.dev, usb_sndbulkpipe(usX2Y->chip.dev, 0x04), &p4out->val.vol, p4out->type == eLT_Light ? sizeof(struct us428_lights) : 5, -- cgit v1.2.3 From da8d5089da6dfd54e5fd05d0c291a63c2bcf6885 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Jan 2009 15:28:57 +0100 Subject: sched: fix possible recursive rq->lock Vaidyanathan Srinivasan reported: > ============================================= > [ INFO: possible recursive locking detected ] > 2.6.28-autotest-tip-sv #1 > --------------------------------------------- > klogd/5062 is trying to acquire lock: > (&rq->lock){++..}, at: [] task_rq_lock+0x45/0x7e > > but task is already holding lock: > (&rq->lock){++..}, at: [] schedule+0x158/0xa31 With sched_mc at 2. (it is default-off) Strictly speaking we'll not deadlock, because ttwu will not be able to place the migration task on our rq, but since the code can deal with both rqs getting unlocked, this seems the easiest way out. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2e3545f57e77..deb5ac8c12f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3728,8 +3728,13 @@ redo: } double_unlock_balance(this_rq, busiest); + /* + * Should not call ttwu while holding a rq->lock + */ + spin_unlock(&this_rq->lock); if (active_balance) wake_up_process(busiest->migration_thread); + spin_lock(&this_rq->lock); } else sd->nr_balance_failed = 0; -- cgit v1.2.3 From 22a9d645677feefd402befd02edd59b122289ef1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 7 Jan 2009 08:45:46 -0800 Subject: async: Asynchronous function calls to speed up kernel boot Right now, most of the kernel boot is strictly synchronous, such that various hardware delays are done sequentially. In order to make the kernel boot faster, this patch introduces infrastructure to allow doing some of the initialization steps asynchronously, which will hide significant portions of the hardware delays in practice. In order to not change device order and other similar observables, this patch does NOT do full parallel initialization. Rather, it operates more in the way an out of order CPU does; the work may be done out of order and asynchronous, but the observable effects (instruction retiring for the CPU) are still done in the original sequence. Signed-off-by: Arjan van de Ven --- include/linux/async.h | 25 ++++ init/do_mounts.c | 2 + init/main.c | 5 +- kernel/Makefile | 3 +- kernel/async.c | 321 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/autoprobe.c | 5 + kernel/module.c | 2 + 7 files changed, 361 insertions(+), 2 deletions(-) create mode 100644 include/linux/async.h create mode 100644 kernel/async.c (limited to 'kernel') diff --git a/include/linux/async.h b/include/linux/async.h new file mode 100644 index 000000000000..c4ecacd0b327 --- /dev/null +++ b/include/linux/async.h @@ -0,0 +1,25 @@ +/* + * async.h: Asynchronous function calls for boot performance + * + * (C) Copyright 2009 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include + +typedef u64 async_cookie_t; +typedef void (async_func_ptr) (void *data, async_cookie_t cookie); + +extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); +extern async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *list); +extern void async_synchronize_full(void); +extern void async_synchronize_full_special(struct list_head *list); +extern void async_synchronize_cookie(async_cookie_t cookie); +extern void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *list); + diff --git a/init/do_mounts.c b/init/do_mounts.c index 5efca73b39f9..708105e163df 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -372,6 +373,7 @@ void __init prepare_namespace(void) /* wait for the known devices to complete their probing */ while (driver_probe_done() != 0) msleep(100); + async_synchronize_full(); md_run_setup(); diff --git a/init/main.c b/init/main.c index b5a892c68375..f66715d8a853 100644 --- a/init/main.c +++ b/init/main.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -684,7 +685,7 @@ asmlinkage void __init start_kernel(void) rest_init(); } -static int initcall_debug; +int initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); int do_one_initcall(initcall_t fn) @@ -785,6 +786,8 @@ static void run_init_process(char *init_filename) */ static noinline int init_post(void) { + /* need to finish all async __init code before freeing the memory */ + async_synchronize_full(); free_initmem(); unlock_kernel(); mark_rodata_ro(); diff --git a/kernel/Makefile b/kernel/Makefile index e1c5bf3365c0..2921d90ce32f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ + async.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/async.c b/kernel/async.c new file mode 100644 index 000000000000..afaa8a653d5a --- /dev/null +++ b/kernel/async.c @@ -0,0 +1,321 @@ +/* + * async.c: Asynchronous function calls for boot performance + * + * (C) Copyright 2009 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + + +/* + +Goals and Theory of Operation + +The primary goal of this feature is to reduce the kernel boot time, +by doing various independent hardware delays and discovery operations +decoupled and not strictly serialized. + +More specifically, the asynchronous function call concept allows +certain operations (primarily during system boot) to happen +asynchronously, out of order, while these operations still +have their externally visible parts happen sequentially and in-order. +(not unlike how out-of-order CPUs retire their instructions in order) + +Key to the asynchronous function call implementation is the concept of +a "sequence cookie" (which, although it has an abstracted type, can be +thought of as a monotonically incrementing number). + +The async core will assign each scheduled event such a sequence cookie and +pass this to the called functions. + +The asynchronously called function should before doing a globally visible +operation, such as registering device numbers, call the +async_synchronize_cookie() function and pass in its own cookie. The +async_synchronize_cookie() function will make sure that all asynchronous +operations that were scheduled prior to the operation corresponding with the +cookie have completed. + +Subsystem/driver initialization code that scheduled asynchronous probe +functions, but which shares global resources with other drivers/subsystems +that do not use the asynchronous call feature, need to do a full +synchronization with the async_synchronize_full() function, before returning +from their init function. This is to maintain strict ordering between the +asynchronous and synchronous parts of the kernel. + +*/ + +#include +#include +#include +#include +#include +#include +#include + +static async_cookie_t next_cookie = 1; + +#define MAX_THREADS 256 +#define MAX_WORK 32768 + +static LIST_HEAD(async_pending); +static LIST_HEAD(async_running); +static DEFINE_SPINLOCK(async_lock); + +struct async_entry { + struct list_head list; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; +}; + +static DECLARE_WAIT_QUEUE_HEAD(async_done); +static DECLARE_WAIT_QUEUE_HEAD(async_new); + +static atomic_t entry_count; +static atomic_t thread_count; + +extern int initcall_debug; + + +/* + * MUST be called with the lock held! + */ +static async_cookie_t __lowest_in_progress(struct list_head *running) +{ + struct async_entry *entry; + if (!list_empty(&async_pending)) { + entry = list_first_entry(&async_pending, + struct async_entry, list); + return entry->cookie; + } else if (!list_empty(running)) { + entry = list_first_entry(running, + struct async_entry, list); + return entry->cookie; + } else { + /* nothing in progress... next_cookie is "infinity" */ + return next_cookie; + } + +} +/* + * pick the first pending entry and run it + */ +static void run_one_entry(void) +{ + unsigned long flags; + struct async_entry *entry; + ktime_t calltime, delta, rettime; + + /* 1) pick one task from the pending queue */ + + spin_lock_irqsave(&async_lock, flags); + if (list_empty(&async_pending)) + goto out; + entry = list_first_entry(&async_pending, struct async_entry, list); + + /* 2) move it to the running queue */ + list_del(&entry->list); + list_add_tail(&entry->list, &async_running); + spin_unlock_irqrestore(&async_lock, flags); + + /* 3) run it (and print duration)*/ + if (initcall_debug) { + printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); + calltime = ktime_get(); + } + entry->func(entry->data, entry->cookie); + if (initcall_debug) { + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, + entry->func, ktime_to_ns(delta) >> 10); + } + + /* 4) remove it from the running queue */ + spin_lock_irqsave(&async_lock, flags); + list_del(&entry->list); + + /* 5) free the entry */ + kfree(entry); + atomic_dec(&entry_count); + + spin_unlock_irqrestore(&async_lock, flags); + + /* 6) wake up any waiters. */ + wake_up(&async_done); + return; + +out: + spin_unlock_irqrestore(&async_lock, flags); +} + + +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +{ + struct async_entry *entry; + unsigned long flags; + async_cookie_t newcookie; + + + /* allow irq-off callers */ + entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); + + /* + * If we're out of memory or if there's too much work + * pending already, we execute synchronously. + */ + if (!entry || atomic_read(&entry_count) > MAX_WORK) { + kfree(entry); + spin_lock_irqsave(&async_lock, flags); + newcookie = next_cookie++; + spin_unlock_irqrestore(&async_lock, flags); + + /* low on memory.. run synchronously */ + ptr(data, newcookie); + return newcookie; + } + entry->func = ptr; + entry->data = data; + entry->running = running; + + spin_lock_irqsave(&async_lock, flags); + newcookie = entry->cookie = next_cookie++; + list_add_tail(&entry->list, &async_pending); + atomic_inc(&entry_count); + spin_unlock_irqrestore(&async_lock, flags); + wake_up(&async_new); + return newcookie; +} + +async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +{ + return __async_schedule(ptr, data, &async_pending); +} +EXPORT_SYMBOL_GPL(async_schedule); + +async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) +{ + return __async_schedule(ptr, data, running); +} +EXPORT_SYMBOL_GPL(async_schedule_special); + +void async_synchronize_full(void) +{ + async_synchronize_cookie(next_cookie); +} +EXPORT_SYMBOL_GPL(async_synchronize_full); + +void async_synchronize_full_special(struct list_head *list) +{ + async_synchronize_cookie_special(next_cookie, list); +} +EXPORT_SYMBOL_GPL(async_synchronize_full_special); + +void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) +{ + ktime_t starttime, delta, endtime; + + if (initcall_debug) { + printk("async_waiting @ %i\n", task_pid_nr(current)); + starttime = ktime_get(); + } + + wait_event(async_done, __lowest_in_progress(running) >= cookie); + + if (initcall_debug) { + endtime = ktime_get(); + delta = ktime_sub(endtime, starttime); + + printk("async_continuing @ %i after %lli usec\n", + task_pid_nr(current), ktime_to_ns(delta) >> 10); + } +} +EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); + +void async_synchronize_cookie(async_cookie_t cookie) +{ + async_synchronize_cookie_special(cookie, &async_running); +} +EXPORT_SYMBOL_GPL(async_synchronize_cookie); + + +static int async_thread(void *unused) +{ + DECLARE_WAITQUEUE(wq, current); + add_wait_queue(&async_new, &wq); + + while (!kthread_should_stop()) { + int ret = HZ; + set_current_state(TASK_INTERRUPTIBLE); + /* + * check the list head without lock.. false positives + * are dealt with inside run_one_entry() while holding + * the lock. + */ + rmb(); + if (!list_empty(&async_pending)) + run_one_entry(); + else + ret = schedule_timeout(HZ); + + if (ret == 0) { + /* + * we timed out, this means we as thread are redundant. + * we sign off and die, but we to avoid any races there + * is a last-straw check to see if work snuck in. + */ + atomic_dec(&thread_count); + wmb(); /* manager must see our departure first */ + if (list_empty(&async_pending)) + break; + /* + * woops work came in between us timing out and us + * signing off; we need to stay alive and keep working. + */ + atomic_inc(&thread_count); + } + } + remove_wait_queue(&async_new, &wq); + + return 0; +} + +static int async_manager_thread(void *unused) +{ + DECLARE_WAITQUEUE(wq, current); + add_wait_queue(&async_new, &wq); + + while (!kthread_should_stop()) { + int tc, ec; + + set_current_state(TASK_INTERRUPTIBLE); + + tc = atomic_read(&thread_count); + rmb(); + ec = atomic_read(&entry_count); + + while (tc < ec && tc < MAX_THREADS) { + kthread_run(async_thread, NULL, "async/%i", tc); + atomic_inc(&thread_count); + tc++; + } + + schedule(); + } + remove_wait_queue(&async_new, &wq); + + return 0; +} + +static int __init async_init(void) +{ + kthread_run(async_manager_thread, NULL, "async/mgr"); + return 0; +} + +core_initcall(async_init); diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index cc0f7321b8ce..1de9700f416e 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internals.h" @@ -34,6 +35,10 @@ unsigned long probe_irq_on(void) unsigned int status; int i; + /* + * quiesce the kernel, or at least the asynchronous portion + */ + async_synchronize_full(); mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to diff --git a/kernel/module.c b/kernel/module.c index 496dcb57b608..c9332c90d5a0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -50,6 +50,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -816,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + async_synchronize_full(); mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); -- cgit v1.2.3 From ad160d23198193135cb2bcc75222e0816b5838c0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 7 Jan 2009 09:28:53 -0800 Subject: async: don't do the initcall stuff post boot while tracking the asynchronous calls during boot using the initcall_debug convention is useful, doing it once the kernel is done is actually bad now that we use asynchronous operations post boot as well... Signed-off-by: Arjan van de Ven --- kernel/async.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index afaa8a653d5a..97373380c9e7 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -124,12 +124,12 @@ static void run_one_entry(void) spin_unlock_irqrestore(&async_lock, flags); /* 3) run it (and print duration)*/ - if (initcall_debug) { + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); - if (initcall_debug) { + if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, @@ -220,14 +220,14 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r { ktime_t starttime, delta, endtime; - if (initcall_debug) { + if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } wait_event(async_done, __lowest_in_progress(running) >= cookie); - if (initcall_debug) { + if (initcall_debug && system_state == SYSTEM_BOOTING) { endtime = ktime_get(); delta = ktime_sub(endtime, starttime); -- cgit v1.2.3 From e8de1481fd7126ee9e93d6889da6f00c05e1e019 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 22 Oct 2008 19:55:31 -0700 Subject: resource: allow MMIO exclusivity for device drivers Device drivers that use pci_request_regions() (and similar APIs) have a reasonable expectation that they are the only ones accessing their device. As part of the e1000e hunt, we were afraid that some userland (X or some bootsplash stuff) was mapping the MMIO region that the driver thought it had exclusively via /dev/mem or via various sysfs resource mappings. This patch adds the option for device drivers to cause their reserved regions to the "banned from /dev/mem use" list, so now both kernel memory and device-exclusive MMIO regions are banned. NOTE: This is only active when CONFIG_STRICT_DEVMEM is set. In addition to the config option, a kernel parameter iomem=relaxed is provided for the cases where developers want to diagnose, in the field, drivers issues from userspace. Reviewed-by: Matthew Wilcox Signed-off-by: Arjan van de Ven Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 4 ++ arch/x86/mm/init_32.c | 2 + arch/x86/mm/init_64.c | 2 + drivers/net/e1000e/netdev.c | 2 +- drivers/pci/pci-sysfs.c | 3 + drivers/pci/pci.c | 107 ++++++++++++++++++++++++++++++++---- include/linux/ioport.h | 11 +++- include/linux/pci.h | 3 + kernel/resource.c | 61 +++++++++++++++++++- 9 files changed, 176 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0b3f6711d2f1..0072fabb1dd1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -918,6 +918,10 @@ and is between 256 and 4096 characters. It is defined in the file inttest= [IA64] + iomem= Disable strict checking of access to MMIO memory + strict regions from userspace. + relaxed + iommu= [x86] off force diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 544d724caeee..88f1b10de3be 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -328,6 +328,8 @@ int devmem_is_allowed(unsigned long pagenr) { if (pagenr <= 256) return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; if (!page_is_ram(pagenr)) return 1; return 0; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 54c437e96541..23f68e77ad1f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -888,6 +888,8 @@ int devmem_is_allowed(unsigned long pagenr) { if (pagenr <= 256) return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; if (!page_is_ram(pagenr)) return 1; return 0; diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c index d4639facd1bd..91817d0afcaf 100644 --- a/drivers/net/e1000e/netdev.c +++ b/drivers/net/e1000e/netdev.c @@ -4807,7 +4807,7 @@ static int __devinit e1000_probe(struct pci_dev *pdev, } } - err = pci_request_selected_regions(pdev, + err = pci_request_selected_regions_exclusive(pdev, pci_select_bars(pdev, IORESOURCE_MEM), e1000e_driver_name); if (err) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 388440e0d222..d5cdccf27a69 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -620,6 +620,9 @@ pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, vma->vm_pgoff += start >> PAGE_SHIFT; mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io; + if (res->flags & IORESOURCE_MEM && iomem_is_exclusive(start)) + return -EINVAL; + return pci_mmap_page_range(pdev, vma, mmap_type, write_combine); } diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 2cfa41e367a7..47663dc0daf7 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1395,7 +1395,8 @@ void pci_release_region(struct pci_dev *pdev, int bar) * Returns 0 on success, or %EBUSY on error. A warning * message is also printed on failure. */ -int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name) +static int __pci_request_region(struct pci_dev *pdev, int bar, const char *res_name, + int exclusive) { struct pci_devres *dr; @@ -1408,8 +1409,9 @@ int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name) goto err_out; } else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) { - if (!request_mem_region(pci_resource_start(pdev, bar), - pci_resource_len(pdev, bar), res_name)) + if (!__request_mem_region(pci_resource_start(pdev, bar), + pci_resource_len(pdev, bar), res_name, + exclusive)) goto err_out; } @@ -1427,6 +1429,47 @@ err_out: return -EBUSY; } +/** + * pci_request_region - Reserved PCI I/O and memory resource + * @pdev: PCI device whose resources are to be reserved + * @bar: BAR to be reserved + * @res_name: Name to be associated with resource. + * + * Mark the PCI region associated with PCI device @pdev BR @bar as + * being reserved by owner @res_name. Do not access any + * address inside the PCI regions unless this call returns + * successfully. + * + * Returns 0 on success, or %EBUSY on error. A warning + * message is also printed on failure. + */ +int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name) +{ + return __pci_request_region(pdev, bar, res_name, 0); +} + +/** + * pci_request_region_exclusive - Reserved PCI I/O and memory resource + * @pdev: PCI device whose resources are to be reserved + * @bar: BAR to be reserved + * @res_name: Name to be associated with resource. + * + * Mark the PCI region associated with PCI device @pdev BR @bar as + * being reserved by owner @res_name. Do not access any + * address inside the PCI regions unless this call returns + * successfully. + * + * Returns 0 on success, or %EBUSY on error. A warning + * message is also printed on failure. + * + * The key difference that _exclusive makes it that userspace is + * explicitly not allowed to map the resource via /dev/mem or + * sysfs. + */ +int pci_request_region_exclusive(struct pci_dev *pdev, int bar, const char *res_name) +{ + return __pci_request_region(pdev, bar, res_name, IORESOURCE_EXCLUSIVE); +} /** * pci_release_selected_regions - Release selected PCI I/O and memory resources * @pdev: PCI device whose resources were previously reserved @@ -1444,20 +1487,14 @@ void pci_release_selected_regions(struct pci_dev *pdev, int bars) pci_release_region(pdev, i); } -/** - * pci_request_selected_regions - Reserve selected PCI I/O and memory resources - * @pdev: PCI device whose resources are to be reserved - * @bars: Bitmask of BARs to be requested - * @res_name: Name to be associated with resource - */ -int pci_request_selected_regions(struct pci_dev *pdev, int bars, - const char *res_name) +int __pci_request_selected_regions(struct pci_dev *pdev, int bars, + const char *res_name, int excl) { int i; for (i = 0; i < 6; i++) if (bars & (1 << i)) - if(pci_request_region(pdev, i, res_name)) + if (__pci_request_region(pdev, i, res_name, excl)) goto err_out; return 0; @@ -1469,6 +1506,26 @@ err_out: return -EBUSY; } + +/** + * pci_request_selected_regions - Reserve selected PCI I/O and memory resources + * @pdev: PCI device whose resources are to be reserved + * @bars: Bitmask of BARs to be requested + * @res_name: Name to be associated with resource + */ +int pci_request_selected_regions(struct pci_dev *pdev, int bars, + const char *res_name) +{ + return __pci_request_selected_regions(pdev, bars, res_name, 0); +} + +int pci_request_selected_regions_exclusive(struct pci_dev *pdev, + int bars, const char *res_name) +{ + return __pci_request_selected_regions(pdev, bars, res_name, + IORESOURCE_EXCLUSIVE); +} + /** * pci_release_regions - Release reserved PCI I/O and memory resources * @pdev: PCI device whose resources were previously reserved by pci_request_regions @@ -1501,6 +1558,29 @@ int pci_request_regions(struct pci_dev *pdev, const char *res_name) return pci_request_selected_regions(pdev, ((1 << 6) - 1), res_name); } +/** + * pci_request_regions_exclusive - Reserved PCI I/O and memory resources + * @pdev: PCI device whose resources are to be reserved + * @res_name: Name to be associated with resource. + * + * Mark all PCI regions associated with PCI device @pdev as + * being reserved by owner @res_name. Do not access any + * address inside the PCI regions unless this call returns + * successfully. + * + * pci_request_regions_exclusive() will mark the region so that + * /dev/mem and the sysfs MMIO access will not be allowed. + * + * Returns 0 on success, or %EBUSY on error. A warning + * message is also printed on failure. + */ +int pci_request_regions_exclusive(struct pci_dev *pdev, const char *res_name) +{ + return pci_request_selected_regions_exclusive(pdev, + ((1 << 6) - 1), res_name); +} + + /** * pci_set_master - enables bus-mastering for device dev * @dev: the PCI device to enable @@ -2149,10 +2229,13 @@ EXPORT_SYMBOL(pci_find_capability); EXPORT_SYMBOL(pci_bus_find_capability); EXPORT_SYMBOL(pci_release_regions); EXPORT_SYMBOL(pci_request_regions); +EXPORT_SYMBOL(pci_request_regions_exclusive); EXPORT_SYMBOL(pci_release_region); EXPORT_SYMBOL(pci_request_region); +EXPORT_SYMBOL(pci_request_region_exclusive); EXPORT_SYMBOL(pci_release_selected_regions); EXPORT_SYMBOL(pci_request_selected_regions); +EXPORT_SYMBOL(pci_request_selected_regions_exclusive); EXPORT_SYMBOL(pci_set_master); EXPORT_SYMBOL(pci_set_mwi); EXPORT_SYMBOL(pci_try_set_mwi); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 041e95aac2bf..f6bb2ca8e3ba 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -49,6 +49,7 @@ struct resource_list { #define IORESOURCE_SIZEALIGN 0x00020000 /* size indicates alignment */ #define IORESOURCE_STARTALIGN 0x00040000 /* start field is alignment */ +#define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 #define IORESOURCE_AUTO 0x40000000 @@ -133,13 +134,16 @@ static inline unsigned long resource_type(struct resource *res) } /* Convenience shorthand with allocation */ -#define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name)) -#define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name)) +#define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) +#define __request_mem_region(start,n,name, excl) __request_region(&iomem_resource, (start), (n), (name), excl) +#define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name), 0) +#define request_mem_region_exclusive(start,n,name) \ + __request_region(&iomem_resource, (start), (n), (name), IORESOURCE_EXCLUSIVE) #define rename_region(region, newname) do { (region)->name = (newname); } while (0) extern struct resource * __request_region(struct resource *, resource_size_t start, - resource_size_t n, const char *name); + resource_size_t n, const char *name, int relaxed); /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) @@ -175,6 +179,7 @@ extern struct resource * __devm_request_region(struct device *dev, extern void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n); extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size); +extern int iomem_is_exclusive(u64 addr); #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 59a3dc2059d3..bfcb39ca8879 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -686,10 +686,13 @@ void pci_fixup_irqs(u8 (*)(struct pci_dev *, u8 *), int (*)(struct pci_dev *, u8, u8)); #define HAVE_PCI_REQ_REGIONS 2 int __must_check pci_request_regions(struct pci_dev *, const char *); +int __must_check pci_request_regions_exclusive(struct pci_dev *, const char *); void pci_release_regions(struct pci_dev *); int __must_check pci_request_region(struct pci_dev *, int, const char *); +int __must_check pci_request_region_exclusive(struct pci_dev *, int, const char *); void pci_release_region(struct pci_dev *, int); int pci_request_selected_regions(struct pci_dev *, int, const char *); +int pci_request_selected_regions_exclusive(struct pci_dev *, int, const char *); void pci_release_selected_regions(struct pci_dev *, int); /* drivers/pci/bus.c */ diff --git a/kernel/resource.c b/kernel/resource.c index e633106b12f6..ca6a1536b205 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res) */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, - const char *name) + const char *name, int flags) { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); @@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent, res->start = start; res->end = start + n - 1; res->flags = IORESOURCE_BUSY; + res->flags |= flags; write_lock(&resource_lock); @@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start, { struct resource * res; - res = __request_region(parent, start, n, "check-region"); + res = __request_region(parent, start, n, "check-region", 0); if (!res) return -EBUSY; @@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev, dr->start = start; dr->n = n; - res = __request_region(parent, start, n, name); + res = __request_region(parent, start, n, name, 0); if (res) devres_add(dev, dr); else @@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) return err; } + +#ifdef CONFIG_STRICT_DEVMEM +static int strict_iomem_checks = 1; +#else +static int strict_iomem_checks; +#endif + +/* + * check if an address is reserved in the iomem resource tree + * returns 1 if reserved, 0 if not reserved. + */ +int iomem_is_exclusive(u64 addr) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + int size = PAGE_SIZE; + + if (!strict_iomem_checks) + return 0; + + addr = addr & PAGE_MASK; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + break; + if (p->end < addr) + continue; + if (p->flags & IORESOURCE_BUSY && + p->flags & IORESOURCE_EXCLUSIVE) { + err = 1; + break; + } + } + read_unlock(&resource_lock); + + return err; +} + +static int __init strict_iomem(char *str) +{ + if (strstr(str, "relaxed")) + strict_iomem_checks = 0; + if (strstr(str, "strict")) + strict_iomem_checks = 1; + return 1; +} + +__setup("iomem=", strict_iomem); -- cgit v1.2.3 From a0e280e0f33f6c859a235fb69a875ed8f3420388 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 7 Jan 2009 16:19:46 +0100 Subject: stop_machine/cpu hotplug: fix disable_nonboot_cpus disable_nonboot_cpus calls _cpu_down. But _cpu_down requires that the caller already created the stop_machine workqueue (like cpu_down does). Otherwise a call to stop_machine will lead to accesses to random memory regions. When introducing this new interface (9ea09af3bd3090e8349ca2899ca2011bd94cda85 "stop_machine: introduce stop_machine_create/destroy") I missed the second call site of _cpu_down. So add the missing stop_machine_create/destroy calls to disable_nonboot_cpus as well. Fixes suspend-to-ram/disk and also this bug: [ 286.547348] BUG: unable to handle kernel paging request at 6b6b6b6b [ 286.548940] IP: [] __stop_machine+0x88/0xe3 [ 286.550598] Oops: 0002 [#1] SMP [ 286.560580] Pid: 3273, comm: halt Not tainted (2.6.28-06127-g238c6d5 [ 286.560580] EIP: is at __stop_machine+0x88/0xe3 [ 286.560580] Process halt (pid: 3273, ti=f1a28000 task=f4530f30 [ 286.560580] Call Trace: [ 286.560580] [] ? _cpu_down+0x10f/0x234 [ 286.560580] [] ? disable_nonboot_cpus+0x58/0xdc [ 286.560580] [] ? kernel_poweroff+0x22/0x39 [ 286.560580] [] ? sys_reboot+0xde/0x14c [ 286.560580] [] ? complete_signal+0x179/0x191 [ 286.560580] [] ? send_signal+0x1cc/0x1e1 [ 286.560580] [] ? _spin_unlock_irqrestore+0x2d/0x3c [ 286.560580] [] ? group_send_signal_info+0x58/0x61 [ 286.560580] [] ? kill_pid_info+0x30/0x3a [ 286.560580] [] ? sys_kill+0x75/0x13a [ 286.560580] [] ? mntput_no_expire+ox1f/0x101 [ 286.560580] [] ? dput+0x1e/0x105 [ 286.560580] [] ? __fput+0x150/0x158 [ 286.560580] [] ? audit_syscall_entry+0x137/0x159 [ 286.560580] [] ? sysenter_do_call+0x12/0x34 Reported-and-tested-by: "Justin P. Mattock" Reviewed-by: Pekka Enberg Signed-off-by: Heiko Carstens Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/cpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 30e74dd6d01b..79e40f00dcb8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -379,8 +379,11 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error = 0; + int cpu, first_cpu, error; + error = stop_machine_create(); + if (error) + return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); /* We take down all of the non-boot CPUs in one shot to avoid races @@ -409,6 +412,7 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); + stop_machine_destroy(); return error; } -- cgit v1.2.3 From 465634adc1d09b490c8ee31885575be39d375d53 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 7 Jan 2009 15:32:11 +0100 Subject: ring_buffer: fix ring_buffer_event_length() Function ring_buffer_event_length() provides an interface to detect the length of data stored in an entry. However, the length contains offsets depending on the internal usage. This makes it unusable. This patch fixes this and now ring_buffer_event_length() returns the alligned length that has been used in ring_buffer_lock_reserve(). Cc: Steven Rostedt Signed-off-by: Robert Richter --- kernel/trace/ring_buffer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 30d57dd01a85..d42b882dfe4b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -117,7 +117,13 @@ rb_event_length(struct ring_buffer_event *event) */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - return rb_event_length(event); + unsigned length = rb_event_length(event); + if (event->type != RINGBUF_TYPE_DATA) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); -- cgit v1.2.3 From c9d557c19f94df42db78d4a5de4d25feee694bad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Jan 2009 14:33:30 -0800 Subject: rcu: fix bug in rcutorture system-shutdown code This patch fixes an rcutorture bug found by Eric Sesterhenn that resulted in oopses in response to "rmmod rcutorture". The problem was in some new code that attempted to handle the case where a system is shut down while rcutorture is still running, for example, when rcutorture is built into the kernel so that it cannot be removed. The fix causes the rcutorture threads to "park" in an schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT) rather than trying to get them to terminate cleanly. Concurrent shutdown and rmmod is illegal. I believe that this is 2.6.29 material, as it is used in some testing setups. For reference, here are the rcutorture operating modes: CONFIG_RCU_TORTURE_TEST=m This is the normal rcutorture build. Use "modprobe rcutorture" (with optional arguments) to start, and "rmmod rcutorture" to stop. If you shut the system down without doing the rmmod, you should see console output like: rcutorture thread rcu_torture_writer parking due to system shutdown One for each rcutorture kthread. CONFIG_RCU_TORTURE_TEST=y CONFIG_RCU_TORTURE_TEST_RUNNABLE=n Use this if you want rcutorture built in, but don't want the test to start running during early boot. To start the torturing: echo 1 > /proc/sys/kernel/rcutorture_runnable To stop the torturing, s/1/0/ You will get "parking" console messages as noted above when you shut the system down. CONFIG_RCU_TORTURE_TEST=y CONFIG_RCU_TORTURE_TEST_RUNNABLE=y Same as above, except that the torturing starts during early boot. Only for the stout of heart and strong of stomach. The same /proc entry noted above may be used to control the test. Located-by: Eric Sesterhenn Tested-by: Eric Sesterhenn Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 113 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 1cff28db56b6..7c4142a79f0a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -136,28 +136,46 @@ static int stutter_pause_test = 0; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */ -#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ -static int fullstop; /* stop generating callbacks at test end. */ -DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ - /* spawning of kthreads. */ +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ + +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +static int fullstop = FULLSTOP_RMMOD; +DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ + /* of kthreads. */ /* - * Detect and respond to a signal-based shutdown. + * Detect and respond to a system shutdown. */ static int rcutorture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { - if (fullstop) - return NOTIFY_DONE; mutex_lock(&fullstop_mutex); - if (!fullstop) + if (fullstop == FULLSTOP_DONTSTOP) fullstop = FULLSTOP_SHUTDOWN; + else + printk(KERN_WARNING /* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; } +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +static void rcutorture_shutdown_absorb(char *title) +{ + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + printk(KERN_NOTICE + "rcutorture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} + /* * Allocate an element from the rcu_tortures pool. */ @@ -219,13 +237,14 @@ rcu_random(struct rcu_random_state *rrsp) } static void -rcu_stutter_wait(void) +rcu_stutter_wait(char *title) { - while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) { + while (stutter_pause_test || !rcutorture_runnable) { if (rcutorture_runnable) schedule_timeout_interruptible(1); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); + rcutorture_shutdown_absorb(title); } } @@ -287,7 +306,7 @@ rcu_torture_cb(struct rcu_head *p) int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - if (fullstop) { + if (fullstop != FULLSTOP_DONTSTOP) { /* Test is ending, just drop callbacks on the floor. */ /* The next initialization will pick up the pieces. */ return; @@ -619,10 +638,11 @@ rcu_torture_writer(void *arg) } rcu_torture_current_version++; oldbatch = cur_ops->completed(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_writer"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) + rcutorture_shutdown_absorb("rcu_torture_writer"); + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -643,11 +663,12 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); cur_ops->sync(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_fakewriter"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) + rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -752,12 +773,13 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); - rcu_stutter_wait(); - } while (!kthread_should_stop() && !fullstop); + rcu_stutter_wait("rcu_torture_reader"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + rcutorture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irqcapable) del_timer_sync(&t); - while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) + while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; } @@ -854,7 +876,8 @@ rcu_torture_stats(void *arg) do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_stats"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); return 0; } @@ -866,52 +889,49 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_var_t tmp_mask; + cpumask_t tmp_mask; int i; - if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) - BUG(); - - cpumask_setall(tmp_mask); + cpus_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) - goto out; + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, tmp_mask); + cpu_clear(rcu_idle_cpu, tmp_mask); - set_cpus_allowed_ptr(current, tmp_mask); + set_cpus_allowed_ptr(current, &tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed_ptr(reader_tasks[i], - tmp_mask); + &tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], - tmp_mask); + &tmp_mask); } if (writer_task) - set_cpus_allowed_ptr(writer_task, tmp_mask); + set_cpus_allowed_ptr(writer_task, &tmp_mask); if (stats_task) - set_cpus_allowed_ptr(stats_task, tmp_mask); + set_cpus_allowed_ptr(stats_task, &tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; else rcu_idle_cpu--; -out: put_online_cpus(); - free_cpumask_var(tmp_mask); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the @@ -925,7 +945,8 @@ rcu_torture_shuffle(void *arg) do { schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_shuffle"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); return 0; } @@ -940,10 +961,11 @@ rcu_torture_stutter(void *arg) do { schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 1; - if (!kthread_should_stop() && !fullstop) + if (!kthread_should_stop()) schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 0; - } while (!kthread_should_stop() && !fullstop); + rcutorture_shutdown_absorb("rcu_torture_stutter"); + } while (!kthread_should_stop()); VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); return 0; } @@ -970,15 +992,16 @@ rcu_torture_cleanup(void) int i; mutex_lock(&fullstop_mutex); - if (!fullstop) { - /* If being signaled, let it happen, then exit. */ + if (fullstop == FULLSTOP_SHUTDOWN) { + printk(KERN_WARNING /* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); - schedule_timeout_interruptible(10 * HZ); + schedule_timeout_uninterruptible(10); if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; } - fullstop = FULLSTOP_CLEANUP; + fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_nb); if (stutter_task) { @@ -1078,7 +1101,7 @@ rcu_torture_init(void) else nrealreaders = 2 * num_online_cpus(); rcu_torture_print_module_parms("Start of test"); - fullstop = 0; + fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ -- cgit v1.2.3 From 8feae13110d60cc6287afabc2887366b0eb226c2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Jan 2009 12:04:47 +0000 Subject: NOMMU: Make VMAs per MM as for MMU-mode linux Make VMAs per mm_struct as for MMU-mode linux. This solves two problems: (1) In SYSV SHM where nattch for a segment does not reflect the number of shmat's (and forks) done. (2) In mmap() where the VMA's vm_mm is set to point to the parent mm by an exec'ing process when VM_EXECUTABLE is specified, regardless of the fact that a VMA might be shared and already have its vm_mm assigned to another process or a dead process. A new struct (vm_region) is introduced to track a mapped region and to remember the circumstances under which it may be shared and the vm_list_struct structure is discarded as it's no longer required. This patch makes the following additional changes: (1) Regions are now allocated with alloc_pages() rather than kmalloc() and with no recourse to __GFP_COMP, so the pages are not composite. Instead, each page has a reference on it held by the region. Anything else that is interested in such a page will have to get a reference on it to retain it. When the pages are released due to unmapping, each page is passed to put_page() and will be freed when the page usage count reaches zero. (2) Excess pages are trimmed after an allocation as the allocation must be made as a power-of-2 quantity of pages. (3) VMAs are added to the parent MM's R/B tree and mmap lists. As an MM may end up with overlapping VMAs within the tree, the VMA struct address is appended to the sort key. (4) Non-anonymous VMAs are now added to the backing inode's prio list. (5) Holes may be punched in anonymous VMAs with munmap(), releasing parts of the backing region. The VMA and region structs will be split if necessary. (6) sys_shmdt() only releases one attachment to a SYSV IPC shared memory segment instead of all the attachments at that addresss. Multiple shmat()'s return the same address under NOMMU-mode instead of different virtual addresses as under MMU-mode. (7) Core dumping for ELF-FDPIC requires fewer exceptions for NOMMU-mode. (8) /proc/maps is now the global list of mapped regions, and may list bits that aren't actually mapped anywhere. (9) /proc/meminfo gains a line (tagged "MmapCopy") that indicates the amount of RAM currently allocated by mmap to hold mappable regions that can't be mapped directly. These are copies of the backing device or file if not anonymous. These changes make NOMMU mode more similar to MMU mode. The downside is that NOMMU mode requires some extra memory to track things over NOMMU without this patch (VMAs are no longer shared, and there are now region structs). Signed-off-by: David Howells Tested-by: Mike Frysinger Acked-by: Paul Mundt --- Documentation/nommu-mmap.txt | 18 +- arch/arm/include/asm/mmu.h | 1 - arch/blackfin/include/asm/mmu.h | 1 - arch/blackfin/kernel/ptrace.c | 6 +- arch/blackfin/kernel/traps.c | 11 +- arch/frv/kernel/ptrace.c | 11 +- arch/h8300/include/asm/mmu.h | 1 - arch/m68knommu/include/asm/mmu.h | 1 - arch/sh/include/asm/mmu.h | 1 - fs/binfmt_elf_fdpic.c | 27 +- fs/proc/internal.h | 2 - fs/proc/meminfo.c | 6 + fs/proc/nommu.c | 71 ++- fs/proc/task_nommu.c | 108 +++-- include/asm-frv/mmu.h | 1 - include/asm-m32r/mmu.h | 1 - include/linux/mm.h | 18 +- include/linux/mm_types.h | 18 +- ipc/shm.c | 12 + kernel/fork.c | 4 +- lib/Kconfig.debug | 7 + mm/mmap.c | 10 + mm/nommu.c | 960 +++++++++++++++++++++++++++------------ 23 files changed, 860 insertions(+), 436 deletions(-) (limited to 'kernel') diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 7714f57caad5..02b89dcf38ac 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -109,12 +109,18 @@ and it's also much more restricted in the latter case: FURTHER NOTES ON NO-MMU MMAP ============================ - (*) A request for a private mapping of less than a page in size may not return - a page-aligned buffer. This is because the kernel calls kmalloc() to - allocate the buffer, not get_free_page(). - - (*) A list of all the mappings on the system is visible through /proc/maps in - no-MMU mode. + (*) A request for a private mapping of a file may return a buffer that is not + page-aligned. This is because XIP may take place, and the data may not be + paged aligned in the backing store. + + (*) A request for an anonymous mapping will always be page aligned. If + possible the size of the request should be a power of two otherwise some + of the space may be wasted as the kernel must allocate a power-of-2 + granule but will only discard the excess if appropriately configured as + this has an effect on fragmentation. + + (*) A list of all the private copy and anonymous mappings on the system is + visible through /proc/maps in no-MMU mode. (*) A list of all the mappings in use by a process is visible through /proc//maps in no-MMU mode. diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 53099d4ee421..b561584d04a1 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -24,7 +24,6 @@ typedef struct { * modified for 2.6 by Hyok S. Choi */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/blackfin/include/asm/mmu.h b/arch/blackfin/include/asm/mmu.h index 757e43906ed4..dbfd686360e6 100644 --- a/arch/blackfin/include/asm/mmu.h +++ b/arch/blackfin/include/asm/mmu.h @@ -10,7 +10,6 @@ struct sram_list_struct { }; typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; unsigned long stack_start; diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index d2d388536630..594e325b40e4 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -160,15 +160,15 @@ put_reg(struct task_struct *task, int regno, unsigned long data) static inline int is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long len) { - struct vm_list_struct *vml; + struct vm_area_struct *vma; struct sram_list_struct *sraml; /* overflow */ if (start + len < start) return -EIO; - for (vml = child->mm->context.vmlist; vml; vml = vml->next) - if (start >= vml->vma->vm_start && start + len < vml->vma->vm_end) + vma = find_vma(child->mm, start); + if (vma && start >= vma->vm_start && start + len <= vma->vm_end) return 0; for (sraml = child->mm->context.sram_list; sraml; sraml = sraml->next) diff --git a/arch/blackfin/kernel/traps.c b/arch/blackfin/kernel/traps.c index 17d8e4172896..5b0667da8d05 100644 --- a/arch/blackfin/kernel/traps.c +++ b/arch/blackfin/kernel/traps.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ static void decode_address(char *buf, unsigned long address) struct mm_struct *mm; unsigned long flags, offset; unsigned char in_atomic = (bfin_read_IPEND() & 0x10) || in_atomic(); + struct rb_node *n; #ifdef CONFIG_KALLSYMS unsigned long symsize; @@ -128,9 +130,10 @@ static void decode_address(char *buf, unsigned long address) if (!mm) continue; - vml = mm->context.vmlist; - while (vml) { - struct vm_area_struct *vma = vml->vma; + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + struct vm_area_struct *vma; + + vma = rb_entry(n, struct vm_area_struct, vm_rb); if (address >= vma->vm_start && address < vma->vm_end) { char _tmpbuf[256]; @@ -176,8 +179,6 @@ static void decode_address(char *buf, unsigned long address) goto done; } - - vml = vml->next; } if (!in_atomic) mmput(mm); diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c index 709e9bdc6126..5e7d401d21e7 100644 --- a/arch/frv/kernel/ptrace.c +++ b/arch/frv/kernel/ptrace.c @@ -69,7 +69,8 @@ static inline int put_reg(struct task_struct *task, int regno, } /* - * check that an address falls within the bounds of the target process's memory mappings + * check that an address falls within the bounds of the target process's memory + * mappings */ static inline int is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long len) @@ -79,11 +80,11 @@ static inline int is_user_addr_valid(struct task_struct *child, return -EIO; return 0; #else - struct vm_list_struct *vml; + struct vm_area_struct *vma; - for (vml = child->mm->context.vmlist; vml; vml = vml->next) - if (start >= vml->vma->vm_start && start + len <= vml->vma->vm_end) - return 0; + vma = find_vma(child->mm, start); + if (vma && start >= vma->vm_start && start + len <= vma->vm_end) + return 0; return -EIO; #endif diff --git a/arch/h8300/include/asm/mmu.h b/arch/h8300/include/asm/mmu.h index 2ce06ea46104..31309969df70 100644 --- a/arch/h8300/include/asm/mmu.h +++ b/arch/h8300/include/asm/mmu.h @@ -4,7 +4,6 @@ /* Copyright (C) 2002, David McCullough */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/m68knommu/include/asm/mmu.h b/arch/m68knommu/include/asm/mmu.h index 5fa6b68353ba..e2da1e6f09fe 100644 --- a/arch/m68knommu/include/asm/mmu.h +++ b/arch/m68knommu/include/asm/mmu.h @@ -4,7 +4,6 @@ /* Copyright (C) 2002, David McCullough */ typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h index fdcb93bc6d11..6c43625bb1a5 100644 --- a/arch/sh/include/asm/mmu.h +++ b/arch/sh/include/asm/mmu.h @@ -9,7 +9,6 @@ typedef struct { mm_context_id_t id; void *vdso; #else - struct vm_list_struct *vmlist; unsigned long end_brk; #endif #ifdef CONFIG_BINFMT_ELF_FDPIC diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index aa5b43205e37..22baf1b13493 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1567,11 +1567,9 @@ end_coredump: static int elf_fdpic_dump_segments(struct file *file, size_t *size, unsigned long *limit, unsigned long mm_flags) { - struct vm_list_struct *vml; - - for (vml = current->mm->context.vmlist; vml; vml = vml->next) { - struct vm_area_struct *vma = vml->vma; + struct vm_area_struct *vma; + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { if (!maydump(vma, mm_flags)) continue; @@ -1617,9 +1615,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, elf_fpxregset_t *xfpu = NULL; #endif int thread_status_size = 0; -#ifndef CONFIG_MMU - struct vm_list_struct *vml; -#endif elf_addr_t *auxv; unsigned long mm_flags; @@ -1685,13 +1680,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, fill_prstatus(prstatus, current, signr); elf_core_copy_regs(&prstatus->pr_reg, regs); -#ifdef CONFIG_MMU segs = current->mm->map_count; -#else - segs = 0; - for (vml = current->mm->context.vmlist; vml; vml = vml->next) - segs++; -#endif #ifdef ELF_CORE_EXTRA_PHDRS segs += ELF_CORE_EXTRA_PHDRS; #endif @@ -1766,20 +1755,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, mm_flags = current->mm->flags; /* write program headers for segments dump */ - for ( -#ifdef CONFIG_MMU - vma = current->mm->mmap; vma; vma = vma->vm_next -#else - vml = current->mm->context.vmlist; vml; vml = vml->next -#endif - ) { + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { struct elf_phdr phdr; size_t sz; -#ifndef CONFIG_MMU - vma = vml->vma; -#endif - sz = vma->vm_end - vma->vm_start; phdr.p_type = PT_LOAD; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e8aeb8b61ce..cd53ff838498 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -41,8 +41,6 @@ do { \ (vmi)->used = 0; \ (vmi)->largest_chunk = 0; \ } while(0) - -extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b1675c4e66da..43d23948384a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -73,6 +73,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" "LowFree: %8lu kB\n" +#endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" #endif "SwapTotal: %8lu kB\n" "SwapFree: %8lu kB\n" @@ -115,6 +118,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freehigh), K(i.totalram-i.totalhigh), K(i.freeram-i.freehigh), +#endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 3f87d2632947..b446d7ad0b0d 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -33,33 +33,33 @@ #include "internal.h" /* - * display a single VMA to a sequenced file + * display a single region to a sequenced file */ -int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +static int nommu_region_show(struct seq_file *m, struct vm_region *region) { unsigned long ino = 0; struct file *file; dev_t dev = 0; int flags, len; - flags = vma->vm_flags; - file = vma->vm_file; + flags = region->vm_flags; + file = region->vm_file; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = region->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", - vma->vm_start, - vma->vm_end, + region->vm_start, + region->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { @@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } /* - * display a list of all the VMAs the kernel knows about + * display a list of all the REGIONs the kernel knows about * - nommu kernals have a single flat list */ -static int nommu_vma_list_show(struct seq_file *m, void *v) +static int nommu_region_list_show(struct seq_file *m, void *_p) { - struct vm_area_struct *vma; + struct rb_node *p = _p; - vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); - return nommu_vma_show(m, vma); + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); } -static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) { - struct rb_node *_rb; + struct rb_node *p; loff_t pos = *_pos; - void *next = NULL; - down_read(&nommu_vma_sem); + down_read(&nommu_region_sem); - for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { - if (pos == 0) { - next = _rb; - break; - } - pos--; - } - - return next; + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; } -static void nommu_vma_list_stop(struct seq_file *m, void *v) +static void nommu_region_list_stop(struct seq_file *m, void *v) { - up_read(&nommu_vma_sem); + up_read(&nommu_region_sem); } -static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return rb_next((struct rb_node *) v); } -static const struct seq_operations proc_nommu_vma_list_seqop = { - .start = nommu_vma_list_start, - .next = nommu_vma_list_next, - .stop = nommu_vma_list_stop, - .show = nommu_vma_list_show +static struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show }; -static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) { - return seq_open(file, &proc_nommu_vma_list_seqop); + return seq_open(file, &proc_nommu_region_list_seqop); } -static const struct file_operations proc_nommu_vma_list_operations = { - .open = proc_nommu_vma_list_open, +static const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = { static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); return 0; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index d4a8be32b902..ca4a48d0d311 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -15,25 +15,25 @@ */ void task_mem(struct seq_file *m, struct mm_struct *mm) { - struct vm_list_struct *vml; + struct vm_area_struct *vma; + struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0; down_read(&mm->mmap_sem); - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (!vml->vma) - continue; + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); - bytes += kobjsize(vml); + bytes += kobjsize(vma); if (atomic_read(&mm->mm_count) > 1 || - atomic_read(&vml->vma->vm_usage) > 1 - ) { - sbytes += kobjsize((void *) vml->vma->vm_start); - sbytes += kobjsize(vml->vma); + vma->vm_region || + vma->vm_flags & VM_MAYSHARE) { + sbytes += kobjsize((void *) vma->vm_start); + if (vma->vm_region) + sbytes += kobjsize(vma->vm_region); } else { - bytes += kobjsize((void *) vml->vma->vm_start); - bytes += kobjsize(vml->vma); - slack += kobjsize((void *) vml->vma->vm_start) - - (vml->vma->vm_end - vml->vma->vm_start); + bytes += kobjsize((void *) vma->vm_start); + slack += kobjsize((void *) vma->vm_start) - + (vma->vm_end - vma->vm_start); } } @@ -70,13 +70,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct rb_node *p; unsigned long vsize = 0; down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - if (tbp->vma) - vsize += kobjsize((void *) tbp->vma->vm_start); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_region->vm_end - vma->vm_region->vm_start; } up_read(&mm->mmap_sem); return vsize; @@ -85,16 +86,15 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct rb_node *p; int size = kobjsize(mm); down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - size += kobjsize(tbp); - if (tbp->vma) { - size += kobjsize(tbp->vma); - size += kobjsize((void *) tbp->vma->vm_start); - } + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + size += kobjsize((void *) vma->vm_start); } size += (*text = mm->end_code - mm->start_code); @@ -104,21 +104,63 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, return size; } +/* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + vma->vm_pgoff << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + len = 25 + sizeof(void *) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + /* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_vml) +static int show_map(struct seq_file *m, void *_p) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; - return nommu_vma_show(m, vml->vma); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; - struct vm_list_struct *vml; struct mm_struct *mm; + struct rb_node *p; loff_t n = *pos; /* pin the task and mm whilst we play with them */ @@ -134,9 +176,9 @@ static void *m_start(struct seq_file *m, loff_t *pos) } /* start from the Nth VMA */ - for (vml = mm->context.vmlist; vml; vml = vml->next) + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) - return vml; + return p; return NULL; } @@ -152,12 +194,12 @@ static void m_stop(struct seq_file *m, void *_vml) } } -static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; (*pos)++; - return vml ? vml->next : NULL; + return p ? rb_next(p) : NULL; } static const struct seq_operations proc_pid_maps_ops = { diff --git a/include/asm-frv/mmu.h b/include/asm-frv/mmu.h index 22c03714fb14..86ca0e86e7d2 100644 --- a/include/asm-frv/mmu.h +++ b/include/asm-frv/mmu.h @@ -22,7 +22,6 @@ typedef struct { unsigned long dtlb_ptd_mapping; /* [DAMR5] PTD mapping for dtlb cached PGE */ #else - struct vm_list_struct *vmlist; unsigned long end_brk; #endif diff --git a/include/asm-m32r/mmu.h b/include/asm-m32r/mmu.h index d9bd724479cf..150cb92bb666 100644 --- a/include/asm-m32r/mmu.h +++ b/include/asm-m32r/mmu.h @@ -4,7 +4,6 @@ #if !defined(CONFIG_MMU) typedef struct { - struct vm_list_struct *vmlist; unsigned long end_brk; } mm_context_t; diff --git a/include/linux/mm.h b/include/linux/mm.h index 4a3d28c86443..b91a73fd1bcc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -56,19 +56,9 @@ extern unsigned long mmap_min_addr; extern struct kmem_cache *vm_area_cachep; -/* - * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is - * disabled, then there's a single shared list of VMAs maintained by the - * system, and mm's subscribe to these individually - */ -struct vm_list_struct { - struct vm_list_struct *next; - struct vm_area_struct *vma; -}; - #ifndef CONFIG_MMU -extern struct rb_root nommu_vma_tree; -extern struct rw_semaphore nommu_vma_sem; +extern struct rb_root nommu_region_tree; +extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif @@ -1061,6 +1051,7 @@ extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_pages_min(void); extern void mem_init(void); +extern void __init mmap_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); @@ -1072,6 +1063,9 @@ extern void setup_per_cpu_pageset(void); static inline void setup_per_cpu_pageset(void) {} #endif +/* nommu.c */ +extern atomic_t mmap_pages_allocated; + /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9cfc9b627fdd..1c1e0d3a1714 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,22 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +/* + * A region containing a mapping of a non-memory backed file under NOMMU + * conditions. These are held in a global tree and are pinned by the VMAs that + * map parts of them. + */ +struct vm_region { + struct rb_node vm_rb; /* link in global region tree */ + unsigned long vm_flags; /* VMA vm_flags */ + unsigned long vm_start; /* start address of region */ + unsigned long vm_end; /* region initialised to here */ + unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ + struct file *vm_file; /* the backing file or NULL */ + + atomic_t vm_usage; /* region usage count */ +}; + /* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -152,7 +168,7 @@ struct vm_area_struct { unsigned long vm_truncate_count;/* truncate_count or restart_addr */ #ifndef CONFIG_MMU - atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */ + struct vm_region *vm_region; /* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ diff --git a/ipc/shm.c b/ipc/shm.c index b125b560240e..d0ab5527bf45 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -990,6 +990,7 @@ asmlinkage long sys_shmdt(char __user *shmaddr) */ vma = find_vma(mm, addr); +#ifdef CONFIG_MMU while (vma) { next = vma->vm_next; @@ -1034,6 +1035,17 @@ asmlinkage long sys_shmdt(char __user *shmaddr) vma = next; } +#else /* CONFIG_MMU */ + /* under NOMMU conditions, the exact address to be destroyed must be + * given */ + retval = -EINVAL; + if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + retval = 0; + } + +#endif + up_write(&mm->mmap_sem); return retval; } diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3d..0bce4a43bb37 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1481,12 +1481,10 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + mmap_init(); } /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2e75478e9c69..d0a32aab03ff 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -512,6 +512,13 @@ config DEBUG_VIRTUAL If unsure, say N. +config DEBUG_NOMMU_REGIONS + bool "Debug the global anon/private NOMMU mapping region tree" + depends on DEBUG_KERNEL && !MMU + help + This option causes the global tree of anonymous and private mapping + regions to be regularly checked for invalid topology. + config DEBUG_WRITECOUNT bool "Debug filesystem writers count" depends on DEBUG_KERNEL diff --git a/mm/mmap.c b/mm/mmap.c index a910c045cfd4..749623196cb9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2472,3 +2472,13 @@ void mm_drop_all_locks(struct mm_struct *mm) mutex_unlock(&mm_all_locks_mutex); } + +/* + * initialise the VMA slab + */ +void __init mmap_init(void) +{ + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL); +} diff --git a/mm/nommu.c b/mm/nommu.c index 23f355bbe262..0d363dfcf10e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -6,7 +6,7 @@ * * See Documentation/nommu-mmap.txt * - * Copyright (c) 2004-2005 David Howells + * Copyright (c) 2004-2008 David Howells * Copyright (c) 2000-2003 David McCullough * Copyright (c) 2000-2001 D Jeff Dionne * Copyright (c) 2002 Greg Ungerer @@ -33,6 +33,28 @@ #include #include #include +#include "internal.h" + +static inline __attribute__((format(printf, 1, 2))) +void no_printk(const char *fmt, ...) +{ +} + +#if 0 +#define kenter(FMT, ...) \ + printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) +#else +#define kenter(FMT, ...) \ + no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) +#endif #include "internal.h" @@ -46,12 +68,15 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int heap_stack_gap = 0; +atomic_t mmap_pages_allocated; + EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); -/* list of shareable VMAs */ -struct rb_root nommu_vma_tree = RB_ROOT; -DECLARE_RWSEM(nommu_vma_sem); +/* list of mapped, potentially shareable regions */ +static struct kmem_cache *vm_region_jar; +struct rb_root nommu_region_tree = RB_ROOT; +DECLARE_RWSEM(nommu_region_sem); struct vm_operations_struct generic_file_vm_ops = { }; @@ -400,129 +425,174 @@ asmlinkage unsigned long sys_brk(unsigned long brk) return mm->brk = brk; } -#ifdef DEBUG -static void show_process_blocks(void) +/* + * initialise the VMA and region record slabs + */ +void __init mmap_init(void) { - struct vm_list_struct *vml; - - printk("Process blocks %d:", current->pid); - - for (vml = ¤t->mm->context.vmlist; vml; vml = vml->next) { - printk(" %p: %p", vml, vml->vma); - if (vml->vma) - printk(" (%d @%lx #%d)", - kobjsize((void *) vml->vma->vm_start), - vml->vma->vm_start, - atomic_read(&vml->vma->vm_usage)); - printk(vml->next ? " ->" : ".\n"); - } + vm_region_jar = kmem_cache_create("vm_region_jar", + sizeof(struct vm_region), 0, + SLAB_PANIC, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL); } -#endif /* DEBUG */ /* - * add a VMA into a process's mm_struct in the appropriate place in the list - * - should be called with mm->mmap_sem held writelocked + * validate the region tree + * - the caller must hold the region lock */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) +#ifdef CONFIG_DEBUG_NOMMU_REGIONS +static noinline void validate_nommu_regions(void) { - struct vm_list_struct **ppv; + struct vm_region *region, *last; + struct rb_node *p, *lastp; - for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) - if ((*ppv)->vma->vm_start > vml->vma->vm_start) - break; + lastp = rb_first(&nommu_region_tree); + if (!lastp) + return; + + last = rb_entry(lastp, struct vm_region, vm_rb); + if (unlikely(last->vm_end <= last->vm_start)) + BUG(); + + while ((p = rb_next(lastp))) { + region = rb_entry(p, struct vm_region, vm_rb); + last = rb_entry(lastp, struct vm_region, vm_rb); + + if (unlikely(region->vm_end <= region->vm_start)) + BUG(); + if (unlikely(region->vm_start < last->vm_end)) + BUG(); - vml->next = *ppv; - *ppv = vml; + lastp = p; + } } +#else +#define validate_nommu_regions() do {} while(0) +#endif /* - * look up the first VMA in which addr resides, NULL if none - * - should be called with mm->mmap_sem at least held readlocked + * add a region into the global tree */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +static void add_nommu_region(struct vm_region *region) { - struct vm_list_struct *loop, *vml; + struct vm_region *pregion; + struct rb_node **p, *parent; - /* search the vm_start ordered list */ - vml = NULL; - for (loop = mm->context.vmlist; loop; loop = loop->next) { - if (loop->vma->vm_start > addr) - break; - vml = loop; + validate_nommu_regions(); + + BUG_ON(region->vm_start & ~PAGE_MASK); + + parent = NULL; + p = &nommu_region_tree.rb_node; + while (*p) { + parent = *p; + pregion = rb_entry(parent, struct vm_region, vm_rb); + if (region->vm_start < pregion->vm_start) + p = &(*p)->rb_left; + else if (region->vm_start > pregion->vm_start) + p = &(*p)->rb_right; + else if (pregion == region) + return; + else + BUG(); } - if (vml && vml->vma->vm_end > addr) - return vml->vma; + rb_link_node(®ion->vm_rb, parent, p); + rb_insert_color(®ion->vm_rb, &nommu_region_tree); - return NULL; + validate_nommu_regions(); } -EXPORT_SYMBOL(find_vma); /* - * find a VMA - * - we don't extend stack VMAs under NOMMU conditions + * delete a region from the global tree */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +static void delete_nommu_region(struct vm_region *region) { - return find_vma(mm, addr); -} + BUG_ON(!nommu_region_tree.rb_node); -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return -ENOMEM; + validate_nommu_regions(); + rb_erase(®ion->vm_rb, &nommu_region_tree); + validate_nommu_regions(); } /* - * look up the first VMA exactly that exactly matches addr - * - should be called with mm->mmap_sem at least held readlocked + * free a contiguous series of pages */ -static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, - unsigned long addr) +static void free_page_series(unsigned long from, unsigned long to) { - struct vm_list_struct *vml; - - /* search the vm_start ordered list */ - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (vml->vma->vm_start == addr) - return vml->vma; - if (vml->vma->vm_start > addr) - break; + for (; from < to; from += PAGE_SIZE) { + struct page *page = virt_to_page(from); + + kdebug("- free %lx", from); + atomic_dec(&mmap_pages_allocated); + if (page_count(page) != 1) + kdebug("free page %p [%d]", page, page_count(page)); + put_page(page); } - - return NULL; } /* - * find a VMA in the global tree + * release a reference to a region + * - the caller must hold the region semaphore, which this releases + * - the region may not have been added to the tree yet, in which case vm_end + * will equal vm_start */ -static inline struct vm_area_struct *find_nommu_vma(unsigned long start) +static void __put_nommu_region(struct vm_region *region) + __releases(nommu_region_sem) { - struct vm_area_struct *vma; - struct rb_node *n = nommu_vma_tree.rb_node; + kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); - while (n) { - vma = rb_entry(n, struct vm_area_struct, vm_rb); + BUG_ON(!nommu_region_tree.rb_node); - if (start < vma->vm_start) - n = n->rb_left; - else if (start > vma->vm_start) - n = n->rb_right; - else - return vma; + if (atomic_dec_and_test(®ion->vm_usage)) { + if (region->vm_end > region->vm_start) + delete_nommu_region(region); + up_write(&nommu_region_sem); + + if (region->vm_file) + fput(region->vm_file); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ + if (region->vm_flags & VM_MAPPED_COPY) { + kdebug("free series"); + free_page_series(region->vm_start, region->vm_end); + } + kmem_cache_free(vm_region_jar, region); + } else { + up_write(&nommu_region_sem); } +} - return NULL; +/* + * release a reference to a region + */ +static void put_nommu_region(struct vm_region *region) +{ + down_write(&nommu_region_sem); + __put_nommu_region(region); } /* - * add a VMA in the global tree + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_sem held writelocked */ -static void add_nommu_vma(struct vm_area_struct *vma) +static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma; + struct vm_area_struct *pvma, **pp; struct address_space *mapping; - struct rb_node **p = &nommu_vma_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p, *parent; + + kenter(",%p", vma); + + BUG_ON(!vma->vm_region); + + mm->map_count++; + vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { @@ -533,42 +603,62 @@ static void add_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* add the VMA to the master list */ + /* add the VMA to the tree */ + parent = NULL; + p = &mm->mm_rb.rb_node; while (*p) { parent = *p; pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - if (vma->vm_start < pvma->vm_start) { + /* sort by: start addr, end addr, VMA struct addr in that order + * (the latter is necessary as we may get identical VMAs) */ + if (vma->vm_start < pvma->vm_start) p = &(*p)->rb_left; - } - else if (vma->vm_start > pvma->vm_start) { + else if (vma->vm_start > pvma->vm_start) p = &(*p)->rb_right; - } - else { - /* mappings are at the same address - this can only - * happen for shared-mem chardevs and shared file - * mappings backed by ramfs/tmpfs */ - BUG_ON(!(pvma->vm_flags & VM_SHARED)); - - if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) - p = &(*p)->rb_right; - else - BUG(); - } + else if (vma->vm_end < pvma->vm_end) + p = &(*p)->rb_left; + else if (vma->vm_end > pvma->vm_end) + p = &(*p)->rb_right; + else if (vma < pvma) + p = &(*p)->rb_left; + else if (vma > pvma) + p = &(*p)->rb_right; + else + BUG(); } rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &nommu_vma_tree); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); + + /* add VMA to the VMA list also */ + for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { + if (pvma->vm_start > vma->vm_start) + break; + if (pvma->vm_start < vma->vm_start) + continue; + if (pvma->vm_end < vma->vm_end) + break; + } + + vma->vm_next = *pp; + *pp = vma; } /* - * delete a VMA from the global list + * delete a VMA from its owning mm_struct and address space */ -static void delete_nommu_vma(struct vm_area_struct *vma) +static void delete_vma_from_mm(struct vm_area_struct *vma) { + struct vm_area_struct **pp; struct address_space *mapping; + struct mm_struct *mm = vma->vm_mm; + + kenter("%p", vma); + + mm->map_count--; + if (mm->mmap_cache == vma) + mm->mmap_cache = NULL; /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -579,8 +669,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); } - /* remove from the master list */ - rb_erase(&vma->vm_rb, &nommu_vma_tree); + /* remove from the MM's tree and list */ + rb_erase(&vma->vm_rb, &mm->mm_rb); + for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { + if (*pp == vma) { + *pp = vma->vm_next; + break; + } + } + + vma->vm_mm = NULL; +} + +/* + * destroy a VMA record + */ +static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + kenter("%p", vma); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) { + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + } + put_nommu_region(vma->vm_region); + kmem_cache_free(vm_area_cachep, vma); +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start <= addr && vma->vm_end > addr) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end > addr) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct rb_node *n = mm->mm_rb.rb_node; + unsigned long end = addr + len; + + /* check the cache first */ + vma = mm->mmap_cache; + if (vma && vma->vm_start == addr && vma->vm_end == end) + return vma; + + /* trawl the tree (there may be multiple mappings in which addr + * resides) */ + for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { + vma = rb_entry(n, struct vm_area_struct, vm_rb); + if (vma->vm_start < addr) + continue; + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end == end) { + mm->mmap_cache = vma; + return vma; + } + } + + return NULL; } /* @@ -595,7 +792,7 @@ static int validate_mmap_request(struct file *file, unsigned long pgoff, unsigned long *_capabilities) { - unsigned long capabilities; + unsigned long capabilities, rlen; unsigned long reqprot = prot; int ret; @@ -615,12 +812,12 @@ static int validate_mmap_request(struct file *file, return -EINVAL; /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) + rlen = PAGE_ALIGN(len); + if (!rlen || rlen > TASK_SIZE) return -ENOMEM; /* offset overflow? */ - if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; if (file) { @@ -794,9 +991,10 @@ static unsigned long determine_vm_flags(struct file *file, } /* - * set up a shared mapping on a file + * set up a shared mapping on a file (the driver or filesystem provides and + * pins the storage) */ -static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; @@ -814,10 +1012,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) /* * set up a private mapping or an anonymous shared mapping */ -static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) +static int do_mmap_private(struct vm_area_struct *vma, + struct vm_region *region, + unsigned long len) { + struct page *pages; + unsigned long total, point, n, rlen; void *base; - int ret; + int ret, order; /* invoke the file's mapping function so that it can keep track of * shared mappings on devices or memory @@ -836,23 +1038,46 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) * make a private copy of the data and map that instead */ } + rlen = PAGE_ALIGN(len); + /* allocate some memory to hold the mapping * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL|__GFP_COMP); - if (!base) + order = get_order(rlen); + kdebug("alloc order %d for %lx", order, len); + + pages = alloc_pages(GFP_KERNEL, order); + if (!pages) goto enomem; - vma->vm_start = (unsigned long) base; - vma->vm_end = vma->vm_start + len; - vma->vm_flags |= VM_MAPPED_COPY; + /* we allocated a power-of-2 sized page set, so we need to trim off the + * excess */ + total = 1 << order; + atomic_add(total, &mmap_pages_allocated); + + point = rlen >> PAGE_SHIFT; + while (total > point) { + order = ilog2(total - point); + n = 1 << order; + kdebug("shave %lu/%lu @%lu", n, total - point, total); + atomic_sub(n, &mmap_pages_allocated); + total -= n; + set_page_refcounted(pages + total); + __free_pages(pages + total, order); + } + + total = rlen >> PAGE_SHIFT; + for (point = 1; point < total; point++) + set_page_refcounted(&pages[point]); -#ifdef WARN_ON_SLACK - if (len + WARN_ON_SLACK <= kobjsize(result)) - printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", - len, current->pid, kobjsize(result) - len); -#endif + base = page_address(pages); + region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + region->vm_start = (unsigned long) base; + region->vm_end = region->vm_start + rlen; + + vma->vm_start = region->vm_start; + vma->vm_end = region->vm_start + len; if (vma->vm_file) { /* read the contents of a file into the copy */ @@ -864,26 +1089,27 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) old_fs = get_fs(); set_fs(KERNEL_DS); - ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); + ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); set_fs(old_fs); if (ret < 0) goto error_free; /* clear the last little bit */ - if (ret < len) - memset(base + ret, 0, len - ret); + if (ret < rlen) + memset(base + ret, 0, rlen - ret); } else { /* if it's an anonymous mapping, then just clear it */ - memset(base, 0, len); + memset(base, 0, rlen); } return 0; error_free: - kfree(base); - vma->vm_start = 0; + free_page_series(region->vm_start, region->vm_end); + region->vm_start = vma->vm_start = 0; + region->vm_end = vma->vm_end = 0; return ret; enomem: @@ -903,13 +1129,14 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long flags, unsigned long pgoff) { - struct vm_list_struct *vml = NULL; - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma; + struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, vm_flags; - void *result; + unsigned long capabilities, vm_flags, result; int ret; + kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -917,73 +1144,120 @@ unsigned long do_mmap_pgoff(struct file *file, * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, &capabilities); - if (ret < 0) + if (ret < 0) { + kleave(" = %d [val]", ret); return ret; + } /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); - /* we're going to need to record the mapping if it works */ - vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); - if (!vml) - goto error_getting_vml; + /* we're going to need to record the mapping */ + region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); + if (!region) + goto error_getting_region; + + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + goto error_getting_vma; + + atomic_set(®ion->vm_usage, 1); + region->vm_flags = vm_flags; + region->vm_pgoff = pgoff; - down_write(&nommu_vma_sem); + INIT_LIST_HEAD(&vma->anon_vma_node); + vma->vm_flags = vm_flags; + vma->vm_pgoff = pgoff; - /* if we want to share, we need to check for VMAs created by other + if (file) { + region->vm_file = file; + get_file(file); + vma->vm_file = file; + get_file(file); + if (vm_flags & VM_EXECUTABLE) { + added_exe_file_vma(current->mm); + vma->vm_mm = current->mm; + } + } + + down_write(&nommu_region_sem); + + /* if we want to share, we need to check for regions created by other * mmap() calls that overlap with our proposed mapping - * - we can only share with an exact match on most regular files + * - we can only share with a superset match on most regular files * - shared mappings on character devices and memory backed files are * permitted to overlap inexactly as far as we are concerned for in * these cases, sharing is handled in the driver or filesystem rather * than here */ if (vm_flags & VM_MAYSHARE) { - unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vmpglen; + struct vm_region *pregion; + unsigned long pglen, rpglen, pgend, rpgend, start; - /* suppress VMA sharing for shared regions */ - if (vm_flags & VM_SHARED && - capabilities & BDI_CAP_MAP_DIRECT) - goto dont_share_VMAs; + pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgend = pgoff + pglen; - for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { - vma = rb_entry(rb, struct vm_area_struct, vm_rb); + for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { + pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (!(pregion->vm_flags & VM_MAYSHARE)) continue; /* search for overlapping mappings on the same file */ - if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) + if (pregion->vm_file->f_path.dentry->d_inode != + file->f_path.dentry->d_inode) continue; - if (vma->vm_pgoff >= pgoff + pglen) + if (pregion->vm_pgoff >= pgend) continue; - vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; - vmpglen >>= PAGE_SHIFT; - if (pgoff >= vma->vm_pgoff + vmpglen) + rpglen = pregion->vm_end - pregion->vm_start; + rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; + rpgend = pregion->vm_pgoff + rpglen; + if (pgoff >= rpgend) continue; - /* handle inexactly overlapping matches between mappings */ - if (vma->vm_pgoff != pgoff || vmpglen != pglen) { + /* handle inexactly overlapping matches between + * mappings */ + if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && + !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { + /* new mapping is not a subset of the region */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) goto sharing_violation; continue; } - /* we've found a VMA we can share */ - atomic_inc(&vma->vm_usage); - - vml->vma = vma; - result = (void *) vma->vm_start; - goto shared; + /* we've found a region we can share */ + atomic_inc(&pregion->vm_usage); + vma->vm_region = pregion; + start = pregion->vm_start; + start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; + vma->vm_start = start; + vma->vm_end = start + len; + + if (pregion->vm_flags & VM_MAPPED_COPY) { + kdebug("share copy"); + vma->vm_flags |= VM_MAPPED_COPY; + } else { + kdebug("share mmap"); + ret = do_mmap_shared_file(vma); + if (ret < 0) { + vma->vm_region = NULL; + vma->vm_start = 0; + vma->vm_end = 0; + atomic_dec(&pregion->vm_usage); + pregion = NULL; + goto error_just_free; + } + } + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; + goto share; } - dont_share_VMAs: - vma = NULL; - /* obtain the address at which to make a shared mapping * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping @@ -994,102 +1268,93 @@ unsigned long do_mmap_pgoff(struct file *file, if (IS_ERR((void *) addr)) { ret = addr; if (ret != (unsigned long) -ENOSYS) - goto error; + goto error_just_free; /* the driver refused to tell us where to site * the mapping so we'll have to attempt to copy * it */ ret = (unsigned long) -ENODEV; if (!(capabilities & BDI_CAP_MAP_COPY)) - goto error; + goto error_just_free; capabilities &= ~BDI_CAP_MAP_DIRECT; + } else { + vma->vm_start = region->vm_start = addr; + vma->vm_end = region->vm_end = addr + len; } } } - /* we're going to need a VMA struct as well */ - vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); - if (!vma) - goto error_getting_vma; - - INIT_LIST_HEAD(&vma->anon_vma_node); - atomic_set(&vma->vm_usage, 1); - if (file) { - get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } - } - vma->vm_file = file; - vma->vm_flags = vm_flags; - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; - - vml->vma = vma; + vma->vm_region = region; /* set up the mapping */ if (file && vma->vm_flags & VM_SHARED) - ret = do_mmap_shared_file(vma, len); + ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, len); + ret = do_mmap_private(vma, region, len); if (ret < 0) - goto error; + goto error_put_region; + + add_nommu_region(region); /* okay... we have a mapping; now we have to register it */ - result = (void *) vma->vm_start; + result = vma->vm_start; current->mm->total_vm += len >> PAGE_SHIFT; - add_nommu_vma(vma); +share: + add_vma_to_mm(current->mm, vma); - shared: - add_vma_to_mm(current->mm, vml); - - up_write(&nommu_vma_sem); + up_write(&nommu_region_sem); if (prot & PROT_EXEC) - flush_icache_range((unsigned long) result, - (unsigned long) result + len); + flush_icache_range(result, result + len); -#ifdef DEBUG - printk("do_mmap:\n"); - show_process_blocks(); -#endif + kleave(" = %lx", result); + return result; - return (unsigned long) result; - - error: - up_write(&nommu_vma_sem); - kfree(vml); +error_put_region: + __put_nommu_region(region); if (vma) { if (vma->vm_file) { fput(vma->vm_file); if (vma->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(vma->vm_mm); } - kfree(vma); + kmem_cache_free(vm_area_cachep, vma); } + kleave(" = %d [pr]", ret); return ret; - sharing_violation: - up_write(&nommu_vma_sem); - printk("Attempt to share mismatched mappings\n"); - kfree(vml); - return -EINVAL; +error_just_free: + up_write(&nommu_region_sem); +error: + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + fput(vma->vm_file); + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(vma->vm_mm); + kmem_cache_free(vm_area_cachep, vma); + kleave(" = %d", ret); + return ret; + +sharing_violation: + up_write(&nommu_region_sem); + printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + ret = -EINVAL; + goto error; - error_getting_vma: - up_write(&nommu_vma_sem); - kfree(vml); - printk("Allocation of vma for %lu byte allocation from process %d failed\n", +error_getting_vma: + kmem_cache_free(vm_region_jar, region); + printk(KERN_WARNING "Allocation of vma for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; - error_getting_vml: - printk("Allocation of vml for %lu byte allocation from process %d failed\n", +error_getting_region: + printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" + " from process %d failed\n", len, current->pid); show_free_areas(); return -ENOMEM; @@ -1097,77 +1362,180 @@ unsigned long do_mmap_pgoff(struct file *file, EXPORT_SYMBOL(do_mmap_pgoff); /* - * handle mapping disposal for uClinux + * split a vma into two pieces at address 'addr', a new vma is allocated either + * for the first part or the tail. */ -static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { - if (vma) { - down_write(&nommu_vma_sem); + struct vm_area_struct *new; + struct vm_region *region; + unsigned long npages; - if (atomic_dec_and_test(&vma->vm_usage)) { - delete_nommu_vma(vma); + kenter(""); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + /* we're only permitted to split anonymous regions that have a single + * owner */ + if (vma->vm_file || + atomic_read(&vma->vm_region->vm_usage) != 1) + return -ENOMEM; - /* IO memory and memory shared directly out of the pagecache from - * ramfs/tmpfs mustn't be released here */ - if (vma->vm_flags & VM_MAPPED_COPY) - kfree((void *) vma->vm_start); + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } - kfree(vma); - } + region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); + if (!region) + return -ENOMEM; + + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) { + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; + } + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + *region = *vma->vm_region; + new->vm_region = region; + + npages = (addr - vma->vm_start) >> PAGE_SHIFT; + + if (new_below) { + region->vm_end = new->vm_end = addr; + } else { + region->vm_start = new->vm_start = addr; + region->vm_pgoff = new->vm_pgoff += npages; + } - up_write(&nommu_vma_sem); + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + delete_vma_from_mm(vma); + down_write(&nommu_region_sem); + delete_nommu_region(vma->vm_region); + if (new_below) { + vma->vm_region->vm_start = vma->vm_start = addr; + vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; + } else { + vma->vm_region->vm_end = vma->vm_end = addr; } + add_nommu_region(vma->vm_region); + add_nommu_region(new->vm_region); + up_write(&nommu_region_sem); + add_vma_to_mm(mm, vma); + add_vma_to_mm(mm, new); + return 0; } /* - * release a mapping - * - under NOMMU conditions the parameters must match exactly to the mapping to - * be removed + * shrink a VMA by removing the specified chunk from either the beginning or + * the end */ -int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) +static int shrink_vma(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long from, unsigned long to) { - struct vm_list_struct *vml, **parent; - unsigned long end = addr + len; + struct vm_region *region; -#ifdef DEBUG - printk("do_munmap:\n"); -#endif + kenter(""); - for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { - if ((*parent)->vma->vm_start > addr) - break; - if ((*parent)->vma->vm_start == addr && - ((len == 0) || ((*parent)->vma->vm_end == end))) - goto found; - } + /* adjust the VMA's pointers, which may reposition it in the MM's tree + * and list */ + delete_vma_from_mm(vma); + if (from > vma->vm_start) + vma->vm_end = from; + else + vma->vm_start = to; + add_vma_to_mm(mm, vma); - printk("munmap of non-mmaped memory by process %d (%s): %p\n", - current->pid, current->comm, (void *) addr); - return -EINVAL; + /* cut the backing region down to size */ + region = vma->vm_region; + BUG_ON(atomic_read(®ion->vm_usage) != 1); - found: - vml = *parent; + down_write(&nommu_region_sem); + delete_nommu_region(region); + if (from > region->vm_start) + region->vm_end = from; + else + region->vm_start = to; + add_nommu_region(region); + up_write(&nommu_region_sem); - put_vma(mm, vml->vma); + free_page_series(from, to); + return 0; +} - *parent = vml->next; - kfree(vml); +/* + * release a mapping + * - under NOMMU conditions the chunk to be unmapped must be backed by a single + * VMA, though it need not cover the whole VMA + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + struct rb_node *rb; + unsigned long end = start + len; + int ret; - update_hiwater_vm(mm); - mm->total_vm -= len >> PAGE_SHIFT; + kenter(",%lx,%zx", start, len); -#ifdef DEBUG - show_process_blocks(); -#endif + if (len == 0) + return -EINVAL; + + /* find the first potentially overlapping VMA */ + vma = find_vma(mm, start); + if (!vma) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d (%s):" + " 0x%lx-0x%lx\n", + current->pid, current->comm, start, start + len - 1); + return -EINVAL; + } + /* we're allowed to split an anonymous VMA but not a file-backed one */ + if (vma->vm_file) { + do { + if (start > vma->vm_start) { + kleave(" = -EINVAL [miss]"); + return -EINVAL; + } + if (end == vma->vm_end) + goto erase_whole_vma; + rb = rb_next(&vma->vm_rb); + vma = rb_entry(rb, struct vm_area_struct, vm_rb); + } while (rb); + kleave(" = -EINVAL [split file]"); + return -EINVAL; + } else { + /* the chunk must be a subset of the VMA found */ + if (start == vma->vm_start && end == vma->vm_end) + goto erase_whole_vma; + if (start < vma->vm_start || end > vma->vm_end) { + kleave(" = -EINVAL [superset]"); + return -EINVAL; + } + if (start & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned start]"); + return -EINVAL; + } + if (end != vma->vm_end && end & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned split]"); + return -EINVAL; + } + if (start != vma->vm_start && end != vma->vm_end) { + ret = split_vma(mm, vma, start, 1); + if (ret < 0) { + kleave(" = %d [split]", ret); + return ret; + } + } + return shrink_vma(mm, vma, start, end); + } + +erase_whole_vma: + delete_vma_from_mm(vma); + delete_vma(mm, vma); + kleave(" = 0"); return 0; } EXPORT_SYMBOL(do_munmap); @@ -1184,29 +1552,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len) } /* - * Release all mappings + * release all the mappings made in a process's VM space */ -void exit_mmap(struct mm_struct * mm) +void exit_mmap(struct mm_struct *mm) { - struct vm_list_struct *tmp; + struct vm_area_struct *vma; - if (mm) { -#ifdef DEBUG - printk("Exit_mmap:\n"); -#endif + if (!mm) + return; - mm->total_vm = 0; + kenter(""); - while ((tmp = mm->context.vmlist)) { - mm->context.vmlist = tmp->next; - put_vma(mm, tmp->vma); - kfree(tmp); - } + mm->total_vm = 0; -#ifdef DEBUG - show_process_blocks(); -#endif + while ((vma = mm->mmap)) { + mm->mmap = vma->vm_next; + delete_vma_from_mm(vma); + delete_vma(mm, vma); } + + kleave(""); } unsigned long do_brk(unsigned long addr, unsigned long len) @@ -1219,8 +1584,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) * time (controlled by the MREMAP_MAYMOVE flag and available VM space) * * under NOMMU conditions, we only permit changing a mapping's size, and only - * as long as it stays within the hole allocated by the kmalloc() call in - * do_mmap_pgoff() and the block is not shareable + * as long as it stays within the region allocated by do_mmap_private() and the + * block is not shareable * * MREMAP_FIXED is not supported under NOMMU conditions */ @@ -1231,13 +1596,16 @@ unsigned long do_mremap(unsigned long addr, struct vm_area_struct *vma; /* insanity checks first */ - if (new_len == 0) + if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; + if (addr & ~PAGE_MASK) + return -EINVAL; + if (flags & MREMAP_FIXED && new_addr != addr) return (unsigned long) -EINVAL; - vma = find_vma_exact(current->mm, addr); + vma = find_vma_exact(current->mm, addr, old_len); if (!vma) return (unsigned long) -EINVAL; @@ -1247,19 +1615,19 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_MAYSHARE) return (unsigned long) -EPERM; - if (new_len > kobjsize((void *) addr)) + if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) return (unsigned long) -ENOMEM; /* all checks complete - do it */ vma->vm_end = vma->vm_start + new_len; - return vma->vm_start; } EXPORT_SYMBOL(do_mremap); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +asmlinkage +unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) { unsigned long ret; -- cgit v1.2.3 From dd8632a12e500a684478fea0951f380478d56fed Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 8 Jan 2009 12:04:47 +0000 Subject: NOMMU: Make mmap allocation page trimming behaviour configurable. NOMMU mmap allocates a piece of memory for an mmap that's rounded up in size to the nearest power-of-2 number of pages. Currently it then discards the excess pages back to the page allocator, making that memory available for use by other things. This can, however, cause greater amount of fragmentation. To counter this, a sysctl is added in order to fine-tune the trimming behaviour. The default behaviour remains to trim pages aggressively, while this can either be disabled completely or set to a higher page-granular watermark in order to have finer-grained control. vm region vm_top bits taken from an earlier patch by David Howells. Signed-off-by: Paul Mundt Signed-off-by: David Howells Tested-by: Mike Frysinger --- Documentation/nommu-mmap.txt | 15 ++++++++++ Documentation/sysctl/vm.txt | 18 ++++++++++++ include/linux/mm_types.h | 1 + kernel/sysctl.c | 14 ++++++++++ mm/nommu.c | 65 ++++++++++++++++++++++++++++---------------- 5 files changed, 90 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 02b89dcf38ac..b565e8279d13 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -248,3 +248,18 @@ PROVIDING SHAREABLE BLOCK DEVICE SUPPORT Provision of shared mappings on block device files is exactly the same as for character devices. If there isn't a real device underneath, then the driver should allocate sufficient contiguous memory to honour any supported mapping. + + +================================= +ADJUSTING PAGE TRIMMING BEHAVIOUR +================================= + +NOMMU mmap automatically rounds up to the nearest power-of-2 number of pages +when performing an allocation. This can have adverse effects on memory +fragmentation, and as such, is left configurable. The default behaviour is to +aggressively trim allocations and discard any excess pages back in to the page +allocator. In order to retain finer-grained control over fragmentation, this +behaviour can either be disabled completely, or bumped up to a higher page +watermark where trimming begins. + +Page trimming behaviour is configurable via the sysctl `vm.nr_trim_pages'. diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index cd05994a49e6..a3415070bcac 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/vm: - numa_zonelist_order - nr_hugepages - nr_overcommit_hugepages +- nr_trim_pages (only if CONFIG_MMU=n) ============================================================== @@ -348,3 +349,20 @@ Change the maximum size of the hugepage pool. The maximum is nr_hugepages + nr_overcommit_hugepages. See Documentation/vm/hugetlbpage.txt + +============================================================== + +nr_trim_pages + +This is available only on NOMMU kernels. + +This value adjusts the excess page trimming behaviour of power-of-2 aligned +NOMMU mmap allocations. + +A value of 0 disables trimming of allocations entirely, while a value of 1 +trims excess pages aggressively. Any value >= 1 acts as the watermark where +trimming of allocations is initiated. + +The default value is 1. + +See Documentation/nommu-mmap.txt for more information. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c1e0d3a1714..92915e81443f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -106,6 +106,7 @@ struct vm_region { unsigned long vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ + unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 92f6e5bc3c24..89d74436318c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifndef CONFIG_MMU +extern int sysctl_nr_trim_pages; +#endif #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ @@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#else + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #endif { .ctl_name = VM_LAPTOP_MODE, diff --git a/mm/nommu.c b/mm/nommu.c index 0d363dfcf10e..a6e8ccfbd400 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough * Copyright (c) 2000-2001 D Jeff Dionne * Copyright (c) 2002 Greg Ungerer - * Copyright (c) 2007 Paul Mundt + * Copyright (c) 2007-2008 Paul Mundt */ #include @@ -66,6 +66,7 @@ atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; atomic_t mmap_pages_allocated; @@ -455,6 +456,8 @@ static noinline void validate_nommu_regions(void) last = rb_entry(lastp, struct vm_region, vm_rb); if (unlikely(last->vm_end <= last->vm_start)) BUG(); + if (unlikely(last->vm_top < last->vm_end)) + BUG(); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); @@ -462,7 +465,9 @@ static noinline void validate_nommu_regions(void) if (unlikely(region->vm_end <= region->vm_start)) BUG(); - if (unlikely(region->vm_start < last->vm_end)) + if (unlikely(region->vm_top < region->vm_end)) + BUG(); + if (unlikely(region->vm_start < last->vm_top)) BUG(); lastp = p; @@ -536,7 +541,7 @@ static void free_page_series(unsigned long from, unsigned long to) /* * release a reference to a region * - the caller must hold the region semaphore, which this releases - * - the region may not have been added to the tree yet, in which case vm_end + * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ static void __put_nommu_region(struct vm_region *region) @@ -547,7 +552,7 @@ static void __put_nommu_region(struct vm_region *region) BUG_ON(!nommu_region_tree.rb_node); if (atomic_dec_and_test(®ion->vm_usage)) { - if (region->vm_end > region->vm_start) + if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -558,7 +563,7 @@ static void __put_nommu_region(struct vm_region *region) * from ramfs/tmpfs mustn't be released here */ if (region->vm_flags & VM_MAPPED_COPY) { kdebug("free series"); - free_page_series(region->vm_start, region->vm_end); + free_page_series(region->vm_start, region->vm_top); } kmem_cache_free(vm_region_jar, region); } else { @@ -999,6 +1004,10 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) int ret; ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret == 0) { + vma->vm_region->vm_top = vma->vm_region->vm_end; + return ret; + } if (ret != -ENOSYS) return ret; @@ -1027,11 +1036,14 @@ static int do_mmap_private(struct vm_area_struct *vma, */ if (vma->vm_file) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); - if (ret != -ENOSYS) { + if (ret == 0) { /* shouldn't return success if we're not sharing */ - BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); - return ret; /* success or a real error */ + BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); + vma->vm_region->vm_top = vma->vm_region->vm_end; + return ret; } + if (ret != -ENOSYS) + return ret; /* getting an ENOSYS error indicates that direct mmap isn't * possible (as opposed to tried but failed) so we'll try to @@ -1051,23 +1063,25 @@ static int do_mmap_private(struct vm_area_struct *vma, if (!pages) goto enomem; - /* we allocated a power-of-2 sized page set, so we need to trim off the - * excess */ total = 1 << order; atomic_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; - while (total > point) { - order = ilog2(total - point); - n = 1 << order; - kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); - total -= n; - set_page_refcounted(pages + total); - __free_pages(pages + total, order); + + /* we allocated a power-of-2 sized page set, so we may want to trim off + * the excess */ + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + while (total > point) { + order = ilog2(total - point); + n = 1 << order; + kdebug("shave %lu/%lu @%lu", n, total - point, total); + atomic_sub(n, &mmap_pages_allocated); + total -= n; + set_page_refcounted(pages + total); + __free_pages(pages + total, order); + } } - total = rlen >> PAGE_SHIFT; for (point = 1; point < total; point++) set_page_refcounted(&pages[point]); @@ -1075,6 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + rlen; + region->vm_top = region->vm_start + (total << PAGE_SHIFT); vma->vm_start = region->vm_start; vma->vm_end = region->vm_start + len; @@ -1110,6 +1125,7 @@ error_free: free_page_series(region->vm_start, region->vm_end); region->vm_start = vma->vm_start = 0; region->vm_end = vma->vm_end = 0; + region->vm_top = 0; return ret; enomem: @@ -1401,7 +1417,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, npages = (addr - vma->vm_start) >> PAGE_SHIFT; if (new_below) { - region->vm_end = new->vm_end = addr; + region->vm_top = region->vm_end = new->vm_end = addr; } else { region->vm_start = new->vm_start = addr; region->vm_pgoff = new->vm_pgoff += npages; @@ -1418,6 +1434,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; } else { vma->vm_region->vm_end = vma->vm_end = addr; + vma->vm_region->vm_top = addr; } add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); @@ -1454,10 +1471,12 @@ static int shrink_vma(struct mm_struct *mm, down_write(&nommu_region_sem); delete_nommu_region(region); - if (from > region->vm_start) - region->vm_end = from; - else + if (from > region->vm_start) { + to = region->vm_top; + region->vm_top = region->vm_end = from; + } else { region->vm_start = to; + } add_nommu_region(region); up_write(&nommu_region_sem); -- cgit v1.2.3 From b9456371a73871d001e67b5f4eac118c2c278e1c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Jan 2009 11:18:31 +0000 Subject: CRED: Fix commit_creds() on a process that has no mm Fix commit_creds()'s handling of a process that has no mm (such as one that is calling or has called daemonize()). commit_creds() should check to see if task->mm is not NULL before calling set_dumpable() on it. Reported-by: Jiri Slaby Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index ff7bc071991c..480a61aec805 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -372,7 +372,8 @@ int commit_creds(struct cred *new) old->fsuid != new->fsuid || old->fsgid != new->fsgid || !cap_issubset(new->cap_permitted, old->cap_permitted)) { - set_dumpable(task->mm, suid_dumpable); + if (task->mm) + set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; smp_wmb(); } -- cgit v1.2.3 From 75139b8274c3e30354daea623f14b43a482a0bb5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:33 -0800 Subject: cgroups: remove some redundant NULL checks - In cgroup_clone(), if vfs_mkdir() returns successfully, dentry->d_fsdata will be the pointer to the newly created cgroup and won't be NULL. - a cgroup file's dentry->d_fsdata won't be NULL, guaranteed by cgroup_add_file(). - When walking through the subsystems of a cgroup_fs (using for_each_subsys), cgrp->subsys[ss->subsys_id] won't be NULL, guaranteed by cgroup_create(). (Also remove 2 unused variables in cgroup_rmdir(). Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f221446aa02d..220e0fd659fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -586,7 +586,7 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) + if (ss->pre_destroy) ss->pre_destroy(ss, cgrp); return; } @@ -610,10 +610,8 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) /* * Release the subsystem state objects. */ - for_each_subsys(cgrp->root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } + for_each_subsys(cgrp->root, ss) + ss->destroy(ss, cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1445,7 +1443,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -1490,7 +1488,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->read) @@ -1554,10 +1552,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); - if (!cft) - return -ENODEV; + if (cft->read_map || cft->read_seq_string) { struct cgroup_seqfile_state *state = kzalloc(sizeof(*state), GFP_USER); @@ -2463,8 +2459,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; - struct super_block *sb; - struct cgroupfs_root *root; /* the vfs holds both inode->i_mutex already */ @@ -2487,8 +2481,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_lock(&cgroup_mutex); parent = cgrp->parent; - root = cgrp->root; - sb = root->sb; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) @@ -2937,7 +2929,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, } /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + ret = vfs_mkdir(inode, dentry, 0755); child = __d_cgrp(dentry); dput(dentry); if (ret) { @@ -2947,13 +2939,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, goto out_release; } - if (!child) { - printk(KERN_INFO - "Couldn't find new cgroup %s\n", nodename); - ret = -ENOMEM; - goto out_release; - } - /* The cgroup now exists. Retake cgroup_mutex and check * that we're still in the same state that we thought we * were. */ -- cgit v1.2.3 From cae7a366f77ea5c9f54ae98c5fc65056877a89ed Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:34 -0800 Subject: ns_cgroup: remove unused spinlock I happened to find the spinlock in struct ns_cgroup is never used. Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ns_cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 43c2111cd54d..78bc3fdac0d2 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -13,7 +13,6 @@ struct ns_cgroup { struct cgroup_subsys_state css; - spinlock_t lock; }; struct cgroup_subsys ns_subsys; @@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); if (!ns_cgroup) return ERR_PTR(-ENOMEM); - spin_lock_init(&ns_cgroup->lock); return &ns_cgroup->css; } -- cgit v1.2.3 From b12b533fa523e94e0cc9dc23274ae4f9439f1313 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:36 -0800 Subject: cgroups: add lock for child->cgroups in cgroup_post_fork() When cgroup_post_fork() is called, child is seen by find_task_by_vpid(), so child->cgroups maybe be changed, It'll incorrect. child->cgroups's refcnt is decreased child->cgroups's refcnt is increased but child->cg_list is added to child->cgroups's list. Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 220e0fd659fa..d7ab4ffd8fd9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2792,8 +2792,10 @@ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); + task_lock(child); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); + task_unlock(child); write_unlock(&css_set_lock); } } -- cgit v1.2.3 From 2019f634ce5904c19eba4e86f51b1a119a53a9f1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:36 -0800 Subject: cgroups: fix cgroup_iter_next() bug We access res->cgroups without the task_lock(), so res->cgroups may be changed. it's unreliable, and "if (l == &res->cgroups->tasks)" may be false forever. We don't need add any lock for fixing this bug. we just access to struct css_set by struct cg_cgroup_link, not by struct task_struct. Since we hold css_set_lock, struct cg_cgroup_link is reliable. Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d7ab4ffd8fd9..a391ab3bdfc6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1808,6 +1808,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, { struct task_struct *res; struct list_head *l = it->task; + struct cg_cgroup_link *link; /* If the iterator cg is NULL, we have no tasks */ if (!it->cg_link) @@ -1815,7 +1816,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; - if (l == &res->cgroups->tasks) { + link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); + if (l == &link->cg->tasks) { /* We reached the end of this task list - move on to * the next cg_cgroup_link */ cgroup_advance_iter(cgrp, it); -- cgit v1.2.3 From b2aa30f7bb381e04c93eed106089ba55553955f1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:37 -0800 Subject: cgroups: don't put struct cgroupfs_root protected by RCU We don't access struct cgroupfs_root in fast path, so we should not put struct cgroupfs_root protected by RCU But the comment in struct cgroup_subsys.root confuse us. struct cgroup_subsys.root is used in these places: 1 find_css_set(): if (ss->root->subsys_list.next == &ss->sibling) 2 rebind_subsystems(): if (ss->root != &rootnode) rcu_assign_pointer(ss->root, root); rcu_assign_pointer(subsys[i]->root, &rootnode); 3 cgroup_has_css_refs(): if (ss->root != cgrp->root) 4 cgroup_init_subsys(): ss->root = &rootnode; 5 proc_cgroupstats_show(): ss->name, ss->root->subsys_bits, ss->root->number_of_cgroups, !ss->disabled); 6 cgroup_clone(): root = subsys->root; if ((root != subsys->root) || All these place we have held cgroup_lock() or we don't dereference to struct cgroupfs_root. It's means wo don't need RCU when use struct cgroup_subsys.root, and we should not put struct cgroupfs_root protected by RCU. Signed-off-by: Lai Jiangshan Reviewed-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 08b78c09b09a..f68dfd8dd53a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -337,7 +337,6 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - /* Protected by RCU */ struct cgroupfs_root *root; struct list_head sibling; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a391ab3bdfc6..a288da176e46 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -713,7 +713,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_add(&ss->sibling, &root->subsys_list); - rcu_assign_pointer(ss->root, root); + ss->root = root; if (ss->bind) ss->bind(ss, cgrp); @@ -725,7 +725,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; - rcu_assign_pointer(subsys[i]->root, &rootnode); + subsys[i]->root = &rootnode; list_del(&ss->sibling); } else if (bit & final_bits) { /* Subsystem state should already exist */ -- cgit v1.2.3 From 104cbd55377029e70fc2cee01089e84b9c36e5dc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:38 -0800 Subject: cgroups: use task_lock() for access tsk->cgroups safe in cgroup_clone() Use task_lock() protect tsk->cgroups and get_css_set(tsk->cgroups). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a288da176e46..00d5136d38c2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2903,6 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&cgroup_mutex); return 0; } + task_lock(tsk); cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); @@ -2915,6 +2916,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, /* Keep the cgroup alive */ get_css_set(cg); + task_unlock(tsk); mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ -- cgit v1.2.3 From 77efecd9e0526327548152df715ab8644ecb5ba0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:39 -0800 Subject: cgroups: call find_css_set() safely in cgroup_attach_task() In cgroup_attach_task(), tsk maybe exit when we call find_css_set(). and find_css_set() will access to invalid css_set. This patch increases the count before get_css_set(), and decreases it after find_css_set(). NOTE: css_set's refcount is also taskcount, after this patch applied, taskcount may be off-by-one WHEN cgroup_lock() is not held. but I reviewed other code which use taskcount, they are still correct. No regression found by reviewing and simply testing. So I do not use two counters in css_set. (one counter for taskcount, the other for refcount. like struct mm_struct) If this fix cause regression, we will use two counters in css_set. Signed-off-by: Lai Jiangshan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 00d5136d38c2..61e92c5867ea 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1214,7 +1214,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) int retval = 0; struct cgroup_subsys *ss; struct cgroup *oldcgrp; - struct css_set *cg = tsk->cgroups; + struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; int subsys_id; @@ -1234,11 +1234,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } + task_lock(tsk); + cg = tsk->cgroups; + get_css_set(cg); + task_unlock(tsk); /* * Locate or allocate a new css_set for this task, * based on its final set of cgroups */ newcg = find_css_set(cg, cgrp); + put_css_set(cg); if (!newcg) return -ENOMEM; -- cgit v1.2.3 From 7534432dcc3c654a8671b6b0cdffd1dbdbc73074 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:07:40 -0800 Subject: cgroups: remove rcu_read_lock() in cgroupstats_build() cgroup_iter_* do not need rcu_read_lock(). In cgroup_enable_task_cg_lists(), do_each_thread() and while_each_thread() are protected by RCU, it's OK, for write_lock(&css_set_lock) implies rcu_read_lock() in non-RT kernel. If we need explicit rcu_read_lock(), we should add rcu_read_lock() in cgroup_enable_task_cg_lists(), not cgroup_iter_*. Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 61e92c5867ea..f55af3daffc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2055,7 +2055,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - rcu_read_lock(); cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { @@ -2080,7 +2079,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } cgroup_iter_end(cgrp, &it); - rcu_read_unlock(); err: return ret; } -- cgit v1.2.3 From e5f6a8609bab0c2d7543ab1505105e011832afd7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:41 -0800 Subject: cgroups: make root_list contains active hierarchies only Don't link rootnode to the root list, so root_list contains active hierarchies only as the comment indicates. And rename for_each_root() to for_each_active_root(). Also remove redundant check in cgroup_kill_sb(). Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f55af3daffc2..fd572d057691 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -84,7 +84,7 @@ struct cgroupfs_root { /* Tracks how many cgroups are currently defined in hierarchy.*/ int number_of_cgroups; - /* A list running through the mounted hierarchies */ + /* A list running through the active hierarchies */ struct list_head root_list; /* Hierarchy-specific flags */ @@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_subsys(_root, _ss) \ list_for_each_entry(_ss, &_root->subsys_list, sibling) -/* for_each_root() allows you to iterate across the active hierarchies */ -#define for_each_root(_root) \ +/* for_each_active_root() allows you to iterate across the active hierarchies */ +#define for_each_active_root(_root) \ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. Protected by @@ -1111,10 +1111,9 @@ static void cgroup_kill_sb(struct super_block *sb) { } write_unlock(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - root_count--; - } + list_del(&root->root_list); + root_count--; + mutex_unlock(&cgroup_mutex); kfree(root); @@ -2559,7 +2558,6 @@ int __init cgroup_init_early(void) INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&rootnode); - list_add(&rootnode.root_list, &roots); root_count = 1; init_task.cgroups = &init_css_set; @@ -2666,15 +2664,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); - for_each_root(root) { + for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int subsys_id; int count = 0; - /* Skip this hierarchy if it has no active subsystems */ - if (!root->actual_subsys_bits) - continue; seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); -- cgit v1.2.3 From 33a68ac1c1b695216e873ee12e819adbe73e4d9f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:42 -0800 Subject: cgroups: add inactive subsystems to rootnode.subsys_list Though for an inactive hierarchy, we have subsys->root == &rootnode, but rootnode's subsys_list is always empty. This conflicts with the code in find_css_set(): for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ... if (ss->root->subsys_list.next == &ss->sibling) { ... } } if (list_empty(&rootnode.subsys_list)) { ... } The above code assumes rootnode.subsys_list links all inactive hierarchies. Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fd572d057691..abf7248f501a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -712,7 +712,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(dummytop->subsys[i]->cgroup != dummytop); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; - list_add(&ss->sibling, &root->subsys_list); + list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(ss, cgrp); @@ -726,7 +726,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; - list_del(&ss->sibling); + list_move(&ss->sibling, &rootnode.subsys_list); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -2521,6 +2521,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); /* Create the top cgroup state for this subsystem */ + list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; css = ss->create(ss, dummytop); /* We don't handle early failures gracefully */ -- cgit v1.2.3 From c12f65d4396e05c51ce3af7f159ead98574a587c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:07:42 -0800 Subject: cgroups: introduce link_css_set() to remove duplicate code Add a common function link_css_set() to link a css_set to a cgroup. Signed-off-by: Li Zefan Cc: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 68 +++++++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index abf7248f501a..4c475ce4e222 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } +/** + * link_css_set - a helper function to link a css_set to a cgroup + * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() + * @cg: the css_set to be linked + * @cgrp: the destination cgroup + */ +static void link_css_set(struct list_head *tmp_cg_links, + struct css_set *cg, struct cgroup *cgrp) +{ + struct cg_cgroup_link *link; + + BUG_ON(list_empty(tmp_cg_links)); + link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, + cgrp_link_list); + link->cg = cg; + list_move(&link->cgrp_link_list, &cgrp->css_sets); + list_add(&link->cg_link_list, &cg->cg_links); +} + /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -399,7 +418,6 @@ static struct css_set *find_css_set( int i; struct list_head tmp_cg_links; - struct cg_cgroup_link *link; struct hlist_head *hhead; @@ -444,26 +462,11 @@ static struct css_set *find_css_set( * only do it for the first subsystem in each * hierarchy */ - if (ss->root->subsys_list.next == &ss->sibling) { - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &cgrp->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); - } - } - if (list_empty(&rootnode.subsys_list)) { - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &dummytop->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); + if (ss->root->subsys_list.next == &ss->sibling) + link_css_set(&tmp_cg_links, res, cgrp); } + if (list_empty(&rootnode.subsys_list)) + link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -988,7 +991,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, root = NULL; } else { /* New superblock */ - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; int i; @@ -1029,7 +1032,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, list_add(&root->root_list, &roots); root_count++; - sb->s_root->d_fsdata = &root->top_cgroup; + sb->s_root->d_fsdata = root_cgrp; root->top_cgroup.dentry = sb->s_root; /* Link the top cgroup in this hierarchy into all @@ -1040,29 +1043,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct hlist_node *node; struct css_set *cg; - hlist_for_each_entry(cg, node, hhead, hlist) { - struct cg_cgroup_link *link; - - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - link->cg = cg; - list_add(&link->cgrp_link_list, - &root->top_cgroup.css_sets); - list_add(&link->cg_link_list, &cg->cg_links); - } + hlist_for_each_entry(cg, node, hhead, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); } write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); - BUG_ON(!list_empty(&cgrp->sibling)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->sibling)); + BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cgroup_populate_dir(cgrp); + cgroup_populate_dir(root_cgrp); mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); } -- cgit v1.2.3 From e7b80bb695a5b64c92e314838e083b2f3bdf29b2 Mon Sep 17 00:00:00 2001 From: Gowrishankar M Date: Wed, 7 Jan 2009 18:07:43 -0800 Subject: cgroups: skip processes from other namespaces when listing a cgroup Once tasks are populated from system namespace inside cgroup, container replaces other namespace task with 0 while listing tasks, inside container. Though this is expected behaviour from container end, there is no use of showing unwanted 0s. In this patch, we check if a process is in same namespace before loading into pid array. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Gowrishankar M Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4c475ce4e222..cb7c72b91f46 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2007,14 +2007,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) { - int n = 0; + int n = 0, pid; struct cgroup_iter it; struct task_struct *tsk; cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { if (unlikely(n == npids)) break; - pidarray[n++] = task_pid_vnr(tsk); + pid = task_pid_vnr(tsk); + if (pid > 0) + pidarray[n++] = pid; } cgroup_iter_end(cgrp, &it); return n; -- cgit v1.2.3 From a47295e6bc42ad35f9c15ac66f598aa24debd4e2 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:07:44 -0800 Subject: cgroups: make cgroup_path() RCU-safe Fix races between /proc/sched_debug by freeing cgroup objects via an RCU callback. Thus any cgroup reference obtained from an RCU-safe source will remain valid during the RCU section. Since dentries are also RCU-safe, this allows us to traverse up the tree safely. Additionally, make cgroup_path() check for a NULL cgrp->dentry to avoid trying to report a path for a partially-created cgroup. [lizf@cn.fujitsu.com: call deactive_super() in cgroup_diput()] Signed-off-by: Paul Menage Reviewed-by: Li Zefan Tested-by: Li Zefan Cc: Peter Zijlstra Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 5 ++++- kernel/cgroup.c | 30 +++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f68dfd8dd53a..73d1c730c3c4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -116,7 +116,7 @@ struct cgroup { struct list_head children; /* my children */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry */ + struct dentry *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; @@ -145,6 +145,9 @@ struct cgroup { int pids_use_count; /* Length of the current tasks_pids array */ int pids_length; + + /* For RCU-protected deletion */ + struct rcu_head rcu_head; }; /* A css_set is a structure holding pointers to a set of diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb7c72b91f46..83ea4f524be5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = cg->subsys[i]->cgroup; + struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) @@ -594,6 +594,13 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } +static void free_cgroup_rcu(struct rcu_head *obj) +{ + struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); + + kfree(cgrp); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -619,11 +626,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); - /* Drop the active superblock reference that we took when we - * created the cgroup */ + /* + * Drop the active superblock reference that we took when we + * created the cgroup + */ deactivate_super(cgrp->root->sb); - kfree(cgrp); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); } @@ -1134,14 +1143,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry) * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held. Writes path of cgroup into buf. - * Returns 0 on success, -errno on error. + * Called with cgroup_mutex held or else with an RCU-protected cgroup + * reference. Writes path of cgroup into buf. Returns 0 on success, + * -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; + struct dentry *dentry = rcu_dereference(cgrp->dentry); - if (cgrp == dummytop) { + if (!dentry || cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup @@ -1154,13 +1165,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { - int len = cgrp->dentry->d_name.len; + int len = dentry->d_name.len; if ((start -= len) < buf) return -ENAMETOOLONG; memcpy(start, cgrp->dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; + dentry = rcu_dereference(cgrp->dentry); if (!cgrp->parent) continue; if (--start < buf) @@ -1663,7 +1675,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, if (!error) { dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); - cgrp->dentry = dentry; + rcu_assign_pointer(cgrp->dentry, dentry); dget(dentry); } dput(dentry); -- cgit v1.2.3 From 28dbc4b6a01fb579a9441c7b81e3d3413dc452df Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 7 Jan 2009 18:08:05 -0800 Subject: memcg: memory cgroup resource counters for hierarchy Add support for building hierarchies in resource counters. Cgroups allows us to build a deep hierarchy, but we currently don't link the resource counters belonging to the memory controller control groups, in the same fashion as the corresponding cgroup entries in the cgroup hierarchy. This patch provides the infrastructure for resource counters that have the same hiearchy as their cgroup counter parts. These set of patches are based on the resource counter hiearchy patches posted by Pavel Emelianov. NOTE: Building hiearchies is expensive, deeper hierarchies imply charging the all the way up to the root. It is known that hiearchies are expensive, so the user needs to be careful and aware of the trade-offs before creating very deep ones. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Balbir Singh Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: Li Zefan Cc: David Rientjes Cc: Pavel Emelianov Cc: Dhaval Giani Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/res_counter.h | 8 ++++++-- kernel/res_counter.c | 44 +++++++++++++++++++++++++++++++++++--------- mm/memcontrol.c | 20 +++++++++++++------- 3 files changed, 54 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 271c1c2c9f6f..dede0a2cfc45 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -43,6 +43,10 @@ struct res_counter { * the routines below consider this to be IRQ-safe */ spinlock_t lock; + /* + * Parent counter, used for hierarchial resource accounting + */ + struct res_counter *parent; }; /** @@ -87,7 +91,7 @@ enum { * helpers for accounting */ -void res_counter_init(struct res_counter *counter); +void res_counter_init(struct res_counter *counter, struct res_counter *parent); /* * charge - try to consume more resource. @@ -103,7 +107,7 @@ void res_counter_init(struct res_counter *counter); int __must_check res_counter_charge_locked(struct res_counter *counter, unsigned long val); int __must_check res_counter_charge(struct res_counter *counter, - unsigned long val); + unsigned long val, struct res_counter **limit_fail_at); /* * uncharge - tell that some portion of the resource is released diff --git a/kernel/res_counter.c b/kernel/res_counter.c index f275c8eca772..bf8e7534c803 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -15,10 +15,11 @@ #include #include -void res_counter_init(struct res_counter *counter) +void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = (unsigned long long)LLONG_MAX; + counter->parent = parent; } int res_counter_charge_locked(struct res_counter *counter, unsigned long val) @@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) return 0; } -int res_counter_charge(struct res_counter *counter, unsigned long val) +int res_counter_charge(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) { int ret; unsigned long flags; - - spin_lock_irqsave(&counter->lock, flags); - ret = res_counter_charge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + struct res_counter *c, *u; + + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + ret = res_counter_charge_locked(c, val); + spin_unlock(&c->lock); + if (ret < 0) { + *limit_fail_at = c; + goto undo; + } + } + ret = 0; + goto done; +undo: + for (u = counter; u != c; u = u->parent) { + spin_lock(&u->lock); + res_counter_uncharge_locked(u, val); + spin_unlock(&u->lock); + } +done: + local_irq_restore(flags); return ret; } @@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) void res_counter_uncharge(struct res_counter *counter, unsigned long val) { unsigned long flags; + struct res_counter *c; - spin_lock_irqsave(&counter->lock, flags); - res_counter_uncharge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + res_counter_uncharge_locked(c, val); + spin_unlock(&c->lock); + } + local_irq_restore(flags); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9846f617115d..e72fb2b4a7d8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -471,6 +471,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, { struct mem_cgroup *mem; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct res_counter *fail_res; /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the @@ -499,11 +500,12 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, int ret; bool noswap = false; - ret = res_counter_charge(&mem->res, PAGE_SIZE); + ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); if (likely(!ret)) { if (!do_swap_account) break; - ret = res_counter_charge(&mem->memsw, PAGE_SIZE); + ret = res_counter_charge(&mem->memsw, PAGE_SIZE, + &fail_res); if (likely(!ret)) break; /* mem+swap counter fails */ @@ -1709,22 +1711,26 @@ static void __init enable_swap_cgroup(void) static struct cgroup_subsys_state * mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { - struct mem_cgroup *mem; + struct mem_cgroup *mem, *parent; int node; mem = mem_cgroup_alloc(); if (!mem) return ERR_PTR(-ENOMEM); - res_counter_init(&mem->res); - res_counter_init(&mem->memsw); - for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; /* root ? */ - if (cont->parent == NULL) + if (cont->parent == NULL) { enable_swap_cgroup(); + parent = NULL; + } else + parent = mem_cgroup_from_cont(cont->parent); + + res_counter_init(&mem->res, parent ? &parent->res : NULL); + res_counter_init(&mem->memsw, parent ? &parent->memsw : NULL); + return &mem->css; free_out: -- cgit v1.2.3 From 999cd8a450f8f93701669a61cac4d3b19eca07e8 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:08:36 -0800 Subject: cgroups: add a per-subsystem hierarchy_mutex These patches introduce new locking/refcount support for cgroups to reduce the need for subsystems to call cgroup_lock(). This will ultimately allow the atomicity of cgroup_rmdir() (which was removed recently) to be restored. These three patches give: 1/3 - introduce a per-subsystem hierarchy_mutex which a subsystem can use to prevent changes to its own cgroup tree 2/3 - use hierarchy_mutex in place of calling cgroup_lock() in the memory controller 3/3 - introduce a css_tryget() function similar to the one recently proposed by Kamezawa, but avoiding spurious refcount failures in the event of a race between a css_tryget() and an unsuccessful cgroup_rmdir() Future patches will likely involve: - using hierarchy mutex in place of cgroup_lock() in more subsystems where appropriate - restoring the atomicity of cgroup_rmdir() with respect to cgroup_create() This patch: Add a hierarchy_mutex to the cgroup_subsys object that protects changes to the hierarchy observed by that subsystem. It is taken by the cgroup subsystem (in addition to cgroup_mutex) for the following operations: - linking a cgroup into that subsystem's cgroup tree - unlinking a cgroup from that subsystem's cgroup tree - moving the subsystem to/from a hierarchy (including across the bind() callback) Thus if the subsystem holds its own hierarchy_mutex, it can safely traverse its own hierarchy. Signed-off-by: Paul Menage Tested-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 2 +- include/linux/cgroup.h | 17 ++++++++++++++++- kernel/cgroup.c | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 52 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 60287e9e9d27..e33ee74eee77 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -528,7 +528,7 @@ example in cpusets, no task may attach before 'cpus' and 'mems' are set up. void bind(struct cgroup_subsys *ss, struct cgroup *root) -(cgroup_mutex held by caller) +(cgroup_mutex and ss->hierarchy_mutex held by caller) Called when a cgroup subsystem is rebound to a different hierarchy and root cgroup. Currently this will only involve movement between diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 73d1c730c3c4..ce1c1f34c30c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -340,8 +340,23 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - struct cgroupfs_root *root; + /* + * Protects sibling/children links of cgroups in this + * hierarchy, plus protects which hierarchy (or none) the + * subsystem is a part of (i.e. root/sibling). To avoid + * potential deadlocks, the following operations should not be + * undertaken while holding any hierarchy_mutex: + * + * - allocating memory + * - initiating hotplug events + */ + struct mutex hierarchy_mutex; + /* + * Link to parent, and list entry in parent's children. + * Protected by this->hierarchy_mutex and cgroup_lock() + */ + struct cgroupfs_root *root; struct list_head sibling; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 83ea4f524be5..8b6379cdf637 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -722,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); + mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(ss, cgrp); - + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -2338,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; } +static void cgroup_lock_hierarchy(struct cgroupfs_root *root) +{ + /* We need to take each hierarchy_mutex in a consistent order */ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_lock_nested(&ss->hierarchy_mutex, i); + } +} + +static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) +{ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_unlock(&ss->hierarchy_mutex); + } +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -2386,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_css(css, ss, cgrp); } + cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); + cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -2504,8 +2532,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); spin_unlock(&release_list_lock); - /* delete my sibling from parent->children */ + + cgroup_lock_hierarchy(cgrp->root); + /* delete this cgroup from parent->children */ list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(cgrp->root); + spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); spin_unlock(&d->d_lock); @@ -2547,6 +2579,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); + mutex_init(&ss->hierarchy_mutex); ss->active = 1; } -- cgit v1.2.3 From e7c5ec9193d32b9559a3bb8893ceedbda85201ff Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Wed, 7 Jan 2009 18:08:38 -0800 Subject: cgroups: add css_tryget() Add css_tryget(), that obtains a counted reference on a CSS. It is used in situations where the caller has a "weak" reference to the CSS, i.e. one that does not protect the cgroup from removal via a reference count, but would instead be cleaned up by a destroy() callback. css_tryget() will return true on success, or false if the cgroup is being removed. This is similar to Kamezawa Hiroyuki's patch from a week or two ago, but with the difference that in the event of css_tryget() racing with a cgroup_rmdir(), css_tryget() will only return false if the cgroup really does get removed. This implementation is done by biasing css->refcnt, so that a refcnt of 1 means "releasable" and 0 means "released or releasing". In the event of a race, css_tryget() distinguishes between "released" and "releasing" by checking for the CSS_REMOVED flag in css->flags. Signed-off-by: Paul Menage Tested-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 38 ++++++++++++++++++++++++++----- kernel/cgroup.c | 61 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 88 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ce1c1f34c30c..e267e62827bb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -52,9 +52,9 @@ struct cgroup_subsys_state { * hierarchy structure */ struct cgroup *cgroup; - /* State maintained by the cgroup system to allow - * subsystems to be "busy". Should be accessed via css_get() - * and css_put() */ + /* State maintained by the cgroup system to allow subsystems + * to be "busy". Should be accessed via css_get(), + * css_tryget() and and css_put(). */ atomic_t refcnt; @@ -64,11 +64,14 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { CSS_ROOT, /* This CSS is the root of the subsystem */ + CSS_REMOVED, /* This CSS is dead */ }; /* - * Call css_get() to hold a reference on the cgroup; - * + * Call css_get() to hold a reference on the css; it can be used + * for a reference obtained via: + * - an existing ref-counted reference to the css + * - task->cgroups for a locked task */ static inline void css_get(struct cgroup_subsys_state *css) @@ -77,9 +80,32 @@ static inline void css_get(struct cgroup_subsys_state *css) if (!test_bit(CSS_ROOT, &css->flags)) atomic_inc(&css->refcnt); } + +static inline bool css_is_removed(struct cgroup_subsys_state *css) +{ + return test_bit(CSS_REMOVED, &css->flags); +} + +/* + * Call css_tryget() to take a reference on a css if your existing + * (known-valid) reference isn't already ref-counted. Returns false if + * the css has been destroyed. + */ + +static inline bool css_tryget(struct cgroup_subsys_state *css) +{ + if (test_bit(CSS_ROOT, &css->flags)) + return true; + while (!atomic_inc_not_zero(&css->refcnt)) { + if (test_bit(CSS_REMOVED, &css->flags)) + return false; + } + return true; +} + /* * css_put() should be called to release a reference taken by - * css_get() + * css_get() or css_tryget() */ extern void __css_put(struct cgroup_subsys_state *css); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8b6379cdf637..c29831076e7a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2333,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 0); + atomic_set(&css->refcnt, 1); css->flags = 0; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); @@ -2465,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the - * cgroup, if the css refcount is also 0, then there should + * cgroup, if the css refcount is also 1, then there should * be no outstanding references, so the subsystem is safe to * destroy. We scan across all subsystems rather than using * the per-hierarchy linked list of mounted subsystems since @@ -2486,12 +2486,62 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) + if (css && (atomic_read(&css->refcnt) > 1)) return 1; } return 0; } +/* + * Atomically mark all (or else none) of the cgroup's CSS objects as + * CSS_REMOVED. Return true on success, or false if the cgroup has + * busy subsystems. Call with cgroup_mutex held + */ + +static int cgroup_clear_css_refs(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + unsigned long flags; + bool failed = false; + local_irq_save(flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + int refcnt; + do { + /* We can only remove a CSS with a refcnt==1 */ + refcnt = atomic_read(&css->refcnt); + if (refcnt > 1) { + failed = true; + goto done; + } + BUG_ON(!refcnt); + /* + * Drop the refcnt to 0 while we check other + * subsystems. This will cause any racing + * css_tryget() to spin until we set the + * CSS_REMOVED bits or abort + */ + } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + } + done: + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (failed) { + /* + * Restore old refcnt if we previously managed + * to clear it from 1 to 0 + */ + if (!atomic_read(&css->refcnt)) + atomic_set(&css->refcnt, 1); + } else { + /* Commit the fact that the CSS is removed */ + set_bit(CSS_REMOVED, &css->flags); + } + } + local_irq_restore(flags); + return !failed; +} + static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -2522,7 +2572,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) - || cgroup_has_css_refs(cgrp)) { + || !cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } @@ -3078,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { + if ((atomic_dec_return(&css->refcnt) == 1) && + notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } -- cgit v1.2.3 From 13337714f3b0307dc7f75ef5d83ecf0db2abbd65 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 7 Jan 2009 18:08:39 -0800 Subject: cpuset: rcu_read_lock() to protect task_cs() task_cs() calls task_subsys_state(). We must use rcu_read_lock() to protect cgroup_subsys_state(). It's correct that top_cpuset is never freed, but cgroup_subsys_state() accesses css_set, this css_set maybe freed when task_cs() called. We use use rcu_read_lock() to protect it. Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 345ace5117de..a841b5c01ef9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -375,14 +375,9 @@ void cpuset_update_task_memory_state(void) struct task_struct *tsk = current; struct cpuset *cs; - if (task_cs(tsk) == &top_cpuset) { - /* Don't need rcu for top_cpuset. It's never freed. */ - my_cpusets_mem_gen = top_cpuset.mems_generation; - } else { - rcu_read_lock(); - my_cpusets_mem_gen = task_cs(tsk)->mems_generation; - rcu_read_unlock(); - } + rcu_read_lock(); + my_cpusets_mem_gen = task_cs(tsk)->mems_generation; + rcu_read_unlock(); if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { mutex_lock(&callback_mutex); -- cgit v1.2.3 From f5813d94279a18ff5936d675e24b44b44a571197 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 7 Jan 2009 18:08:40 -0800 Subject: cpusets: set task's cpu_allowed to cpu_possible_map when attaching it into top cpuset I found a bug on my dual-cpu box. I created a sub cpuset in top cpuset and assign 1 to its cpus. And then we attach some tasks into this sub cpuset. After this, we offline CPU1. Now, the tasks in this new cpuset are moved into top cpuset automatically because there is no cpu in sub cpuset. Then we online CPU1, we find all the tasks which doesn't belong to top cpuset originally just run on CPU0. We fix this bug by setting task's cpu_allowed to cpu_possible_map when attaching it into top cpuset. This method needn't modify the current behavior of cpusets on CPU hotplug, and all of tasks in top cpuset use cpu_possible_map to initialize their cpu_allowed. Signed-off-by: Miao Xie Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a841b5c01ef9..6012e326e856 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1338,10 +1338,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cpuset *oldcs = cgroup_cs(oldcont); int err; - mutex_lock(&callback_mutex); - guarantee_online_cpus(cs, &cpus); + if (cs == &top_cpuset) { + cpus = cpu_possible_map; + } else { + mutex_lock(&callback_mutex); + guarantee_online_cpus(cs, &cpus); + mutex_unlock(&callback_mutex); + } err = set_cpus_allowed_ptr(tsk, &cpus); - mutex_unlock(&callback_mutex); if (err) return; -- cgit v1.2.3 From 5a7625df725a486ad600b0036b6111dbd6321c41 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:41 -0800 Subject: cpuset: remove on stack cpumask_t in cpuset_sprintf_cpulist() This patchset converts cpuset to use new cpumask API, and thus remove on stack cpumask_t to reduce stack usage. Before: # cat kernel/cpuset.c include/linux/cpuset.h | grep -c cpumask_t 21 After: # cat kernel/cpuset.c include/linux/cpuset.h | grep -c cpumask_t 0 This patch: Impact: reduce stack usage It's safe to call cpulist_scnprintf inside callback_mutex, and thus we can just remove the cpumask_t and no need to allocate a cpumask_var_t. Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6012e326e856..41c2343df975 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1486,13 +1486,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - cpumask_t mask; + int ret; mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; + ret = cpulist_scnprintf(page, PAGE_SIZE, &cs->cpus_allowed); mutex_unlock(&callback_mutex); - return cpulist_scnprintf(page, PAGE_SIZE, &mask); + return ret; } static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) -- cgit v1.2.3 From 5771f0a2236df69683e9abea87f35f522c97ff94 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:41 -0800 Subject: cpuset: remove on stack cpumask_t in cpuset_can_attach() Impact: reduce stack usage Just use cs->cpus_allowed, and no need to allocate a cpumask_var_t. Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 41c2343df975..afa29cfc5bbb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1311,20 +1311,19 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) { struct cpuset *cs = cgroup_cs(cont); + int ret = 0; if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; - if (tsk->flags & PF_THREAD_BOUND) { - cpumask_t mask; + if (tsk->flags & PF_THREAD_BOUND) { mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; + if (!cpus_equal(tsk->cpus_allowed, cs->cpus_allowed)) + ret = -EINVAL; mutex_unlock(&callback_mutex); - if (!cpus_equal(tsk->cpus_allowed, mask)) - return -EINVAL; } - return security_task_setscheduler(tsk, 0, NULL); + return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); } static void cpuset_attach(struct cgroup_subsys *ss, -- cgit v1.2.3 From 2341d1b6598c7146d64a5050b53a72a5a819617f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:42 -0800 Subject: cpuset: convert cpuset_attach() to use cpumask_var_t Impact: reduce stack usage Allocate a global cpumask_var_t at boot, and use it in cpuset_attach(), so we won't fail cpuset_attach(). Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index afa29cfc5bbb..1e32e6b380af 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1306,6 +1306,9 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* Protected by cgroup_lock */ +static cpumask_var_t cpus_attach; + /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) @@ -1330,7 +1333,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk) { - cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); @@ -1338,13 +1340,13 @@ static void cpuset_attach(struct cgroup_subsys *ss, int err; if (cs == &top_cpuset) { - cpus = cpu_possible_map; + cpumask_copy(cpus_attach, cpu_possible_mask); } else { mutex_lock(&callback_mutex); - guarantee_online_cpus(cs, &cpus); + guarantee_online_cpus(cs, cpus_attach); mutex_unlock(&callback_mutex); } - err = set_cpus_allowed_ptr(tsk, &cpus); + err = set_cpus_allowed_ptr(tsk, cpus_attach); if (err) return; @@ -1357,7 +1359,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, cpuset_migrate_mm(mm, &from, &to); mmput(mm); } - } /* The various types of files and directories in a cpuset file system */ @@ -1838,6 +1839,9 @@ int __init cpuset_init(void) if (err < 0) return err; + if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) + BUG(); + number_of_cpusets = 1; return 0; } -- cgit v1.2.3 From 645fcc9d2f6946f97a41c8d00edee38f8a6f0060 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:43 -0800 Subject: cpuset: don't allocate trial cpuset on stack Impact: cleanups, reduce stack usage This patch prepares for the next patch. When we convert cpuset.cpus_allowed to cpumask_var_t, (trialcs = *cs) no longer works. Another result of this patch is reducing stack usage of trialcs. sizeof(*cs) can be as large as 148 bytes on x86_64, so it's really not good to have it on stack. Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 93 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1e32e6b380af..f66527bfd216 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -415,6 +415,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) is_mem_exclusive(p) <= is_mem_exclusive(q); } +/** + * alloc_trial_cpuset - allocate a trial cpuset + * @cs: the cpuset that the trial cpuset duplicates + */ +static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +{ + return kmemdup(cs, sizeof(*cs), GFP_KERNEL); +} + +/** + * free_trial_cpuset - free the trial cpuset + * @trial: the trial cpuset to be freed + */ +static void free_trial_cpuset(struct cpuset *trial) +{ + kfree(trial); +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -880,10 +898,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) * @cs: the cpuset to consider * @buf: buffer of cpu numbers written to this cpuset */ -static int update_cpumask(struct cpuset *cs, const char *buf) +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) { struct ptr_heap heap; - struct cpuset trialcs; int retval; int is_load_balanced; @@ -891,8 +909,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cs == &top_cpuset) return -EACCES; - trialcs = *cs; - /* * An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case @@ -900,31 +916,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * with tasks have cpus. */ if (!*buf) { - cpus_clear(trialcs.cpus_allowed); + cpus_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, &trialcs.cpus_allowed); + retval = cpulist_parse(buf, &trialcs->cpus_allowed); if (retval < 0) return retval; - if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) + if (!cpus_subset(trialcs->cpus_allowed, cpu_online_map)) return -EINVAL; } - retval = validate_change(cs, &trialcs); + retval = validate_change(cs, trialcs); if (retval < 0) return retval; /* Nothing to do if the cpus didn't change */ - if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) + if (cpus_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; - is_load_balanced = is_sched_load_balance(&trialcs); + is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); - cs->cpus_allowed = trialcs.cpus_allowed; + cs->cpus_allowed = trialcs->cpus_allowed; mutex_unlock(&callback_mutex); /* @@ -1099,9 +1115,9 @@ done: * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ -static int update_nodemask(struct cpuset *cs, const char *buf) +static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) { - struct cpuset trialcs; nodemask_t oldmem; int retval; @@ -1112,8 +1128,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf) if (cs == &top_cpuset) return -EACCES; - trialcs = *cs; - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. * Since nodelist_parse() fails on an empty mask, we special case @@ -1121,27 +1135,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf) * with tasks have memory. */ if (!*buf) { - nodes_clear(trialcs.mems_allowed); + nodes_clear(trialcs->mems_allowed); } else { - retval = nodelist_parse(buf, trialcs.mems_allowed); + retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) goto done; - if (!nodes_subset(trialcs.mems_allowed, + if (!nodes_subset(trialcs->mems_allowed, node_states[N_HIGH_MEMORY])) return -EINVAL; } oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs.mems_allowed)) { + if (nodes_equal(oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } - retval = validate_change(cs, &trialcs); + retval = validate_change(cs, trialcs); if (retval < 0) goto done; mutex_lock(&callback_mutex); - cs->mems_allowed = trialcs.mems_allowed; + cs->mems_allowed = trialcs->mems_allowed; cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); @@ -1181,31 +1195,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { - struct cpuset trialcs; + struct cpuset *trialcs; int err; int balance_flag_changed; - trialcs = *cs; + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + if (turning_on) - set_bit(bit, &trialcs.flags); + set_bit(bit, &trialcs->flags); else - clear_bit(bit, &trialcs.flags); + clear_bit(bit, &trialcs->flags); - err = validate_change(cs, &trialcs); + err = validate_change(cs, trialcs); if (err < 0) - return err; + goto out; balance_flag_changed = (is_sched_load_balance(cs) != - is_sched_load_balance(&trialcs)); + is_sched_load_balance(trialcs)); mutex_lock(&callback_mutex); - cs->flags = trialcs.flags; + cs->flags = trialcs->flags; mutex_unlock(&callback_mutex); - if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) + if (!cpus_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); - return 0; +out: + free_trial_cpuset(trialcs); + return err; } /* @@ -1453,21 +1472,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *trialcs; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + switch (cft->private) { case FILE_CPULIST: - retval = update_cpumask(cgroup_cs(cgrp), buf); + retval = update_cpumask(cs, trialcs, buf); break; case FILE_MEMLIST: - retval = update_nodemask(cgroup_cs(cgrp), buf); + retval = update_nodemask(cs, trialcs, buf); break; default: retval = -EINVAL; break; } + + free_trial_cpuset(trialcs); cgroup_unlock(); return retval; } -- cgit v1.2.3 From 300ed6cbb70718872cb4936d1d22ef295f9ba44d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:44 -0800 Subject: cpuset: convert cpuset->cpus_allowed to cpumask_var_t Impact: use new cpumask API This patch mainly does the following things: - change cs->cpus_allowed from cpumask_t to cpumask_var_t - call alloc_bootmem_cpumask_var() for top_cpuset in cpuset_init_early() - call alloc_cpumask_var() for other cpusets - replace cpus_xxx() to cpumask_xxx() Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 100 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f66527bfd216..fc294aa9a97a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -84,7 +84,7 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ + cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ struct cpuset *parent; /* my parent */ @@ -195,8 +195,6 @@ static int cpuset_mems_generation; static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), - .cpus_allowed = CPU_MASK_ALL, - .mems_allowed = NODE_MASK_ALL, }; /* @@ -278,7 +276,7 @@ static struct file_system_type cpuset_fs_type = { }; /* - * Return in *pmask the portion of a cpusets's cpus_allowed that + * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy * until we find one that does have some online cpus. If we get * all the way to the top and still haven't found any online cpus, @@ -293,13 +291,13 @@ static struct file_system_type cpuset_fs_type = { static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) { - while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) + while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = cs->parent; if (cs) - cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else - *pmask = cpu_online_map; - BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); + cpumask_copy(pmask, cpu_online_mask); + BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); } /* @@ -409,7 +407,7 @@ void cpuset_update_task_memory_state(void) static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpus_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -421,7 +419,19 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) */ static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) { - return kmemdup(cs, sizeof(*cs), GFP_KERNEL); + struct cpuset *trial; + + trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + if (!trial) + return NULL; + + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { + kfree(trial); + return NULL; + } + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + + return trial; } /** @@ -430,6 +440,7 @@ static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) */ static void free_trial_cpuset(struct cpuset *trial) { + free_cpumask_var(trial->cpus_allowed); kfree(trial); } @@ -482,7 +493,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) c = cgroup_cs(cont); if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) return -EINVAL; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -492,7 +503,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ if (cgroup_task_count(cur->css.cgroup)) { - if (cpus_empty(trial->cpus_allowed) || + if (cpumask_empty(trial->cpus_allowed) || nodes_empty(trial->mems_allowed)) { return -ENOSPC; } @@ -507,7 +518,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { - return cpus_intersects(a->cpus_allowed, b->cpus_allowed); + return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); } static void @@ -532,7 +543,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) cp = list_first_entry(&q, struct cpuset, stack_list); list_del(q.next); - if (cpus_empty(cp->cpus_allowed)) + if (cpumask_empty(cp->cpus_allowed)) continue; if (is_sched_load_balance(cp)) @@ -627,7 +638,7 @@ static int generate_sched_domains(cpumask_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - *doms = top_cpuset.cpus_allowed; + cpumask_copy(doms, top_cpuset.cpus_allowed); ndoms = 1; goto done; @@ -646,7 +657,7 @@ static int generate_sched_domains(cpumask_t **domains, cp = list_first_entry(&q, struct cpuset, stack_list); list_del(q.next); - if (cpus_empty(cp->cpus_allowed)) + if (cpumask_empty(cp->cpus_allowed)) continue; /* @@ -739,7 +750,7 @@ restart: struct cpuset *b = csa[j]; if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); + cpumask_or(dp, dp, b->cpus_allowed); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -848,7 +859,7 @@ void rebuild_sched_domains(void) static int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - return !cpus_equal(tsk->cpus_allowed, + return !cpumask_equal(&tsk->cpus_allowed, (cgroup_cs(scan->cg))->cpus_allowed); } @@ -866,7 +877,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); + set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -916,13 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * with tasks have cpus. */ if (!*buf) { - cpus_clear(trialcs->cpus_allowed); + cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, &trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) return retval; - if (!cpus_subset(trialcs->cpus_allowed, cpu_online_map)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) return -EINVAL; } retval = validate_change(cs, trialcs); @@ -930,7 +941,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return retval; /* Nothing to do if the cpus didn't change */ - if (cpus_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); @@ -940,7 +951,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); - cs->cpus_allowed = trialcs->cpus_allowed; + cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); /* @@ -1028,7 +1039,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ fudge = 10; /* spare mmarray[] slots */ - fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ retval = -ENOMEM; /* @@ -1176,7 +1187,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; - if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) async_rebuild_sched_domains(); } @@ -1219,7 +1231,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs->flags; mutex_unlock(&callback_mutex); - if (!cpus_empty(trialcs->cpus_allowed) && balance_flag_changed) + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); out: @@ -1335,12 +1347,12 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cpuset *cs = cgroup_cs(cont); int ret = 0; - if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; if (tsk->flags & PF_THREAD_BOUND) { mutex_lock(&callback_mutex); - if (!cpus_equal(tsk->cpus_allowed, cs->cpus_allowed)) + if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) ret = -EINVAL; mutex_unlock(&callback_mutex); } @@ -1516,7 +1528,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) int ret; mutex_lock(&callback_mutex); - ret = cpulist_scnprintf(page, PAGE_SIZE, &cs->cpus_allowed); + ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); return ret; @@ -1755,7 +1767,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, parent_cs = cgroup_cs(parent); cs->mems_allowed = parent_cs->mems_allowed; - cs->cpus_allowed = parent_cs->cpus_allowed; + cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); return; } @@ -1781,6 +1793,10 @@ static struct cgroup_subsys_state *cpuset_create( cs = kmalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { + kfree(cs); + return ERR_PTR(-ENOMEM); + } cpuset_update_task_memory_state(); cs->flags = 0; @@ -1789,7 +1805,7 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpus_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); @@ -1816,6 +1832,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); number_of_cpusets--; + free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1839,6 +1856,8 @@ struct cgroup_subsys cpuset_subsys = { int __init cpuset_init_early(void) { + alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); + top_cpuset.mems_generation = cpuset_mems_generation++; return 0; } @@ -1854,7 +1873,7 @@ int __init cpuset_init(void) { int err = 0; - cpus_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); @@ -1943,7 +1962,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * has online cpus, so can't be empty). */ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed) || + while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) parent = parent->parent; @@ -1984,7 +2003,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -1992,13 +2011,14 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); - cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_online_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); /* Move tasks from the empty cpuset to a parent */ - if (cpus_empty(cp->cpus_allowed) || + if (cpumask_empty(cp->cpus_allowed) || nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { @@ -2039,7 +2059,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, } cgroup_lock(); - top_cpuset.cpus_allowed = cpu_online_map; + cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2084,7 +2104,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - top_cpuset.cpus_allowed = cpu_online_map; + cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); @@ -2096,7 +2116,7 @@ void __init cpuset_init_smp(void) * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. * - * Description: Returns the cpumask_t cpus_allowed of the cpuset + * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_online_map, even if this means going outside the * tasks cpuset. -- cgit v1.2.3 From 6af866af34a96fed24a55979a78b6f73bd4e8e87 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Jan 2009 18:08:45 -0800 Subject: cpuset: remove remaining pointers to cpumask_t Impact: cleanups, use new cpumask API Final trivial cleanups: mainly s/cpumask_t/struct cpumask Note there is a FIXME in generate_sched_domains(). A future patch will change struct cpumask *doms to struct cpumask *doms[]. (I suppose Rusty will do this.) Signed-off-by: Li Zefan Cc: Ingo Molnar Cc: Rusty Russell Acked-by: Mike Travis Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 10 ++++++---- kernel/cpuset.c | 28 +++++++++++++++------------- 2 files changed, 21 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 51ea2bdea0f9..90c6074a36ca 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,8 +20,9 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask); -extern void cpuset_cpus_allowed_locked(struct task_struct *p, cpumask_t *mask); +extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +extern void cpuset_cpus_allowed_locked(struct task_struct *p, + struct cpumask *mask); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -86,12 +87,13 @@ static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask) +static inline void cpuset_cpus_allowed(struct task_struct *p, + struct cpumask *mask) { *mask = cpu_possible_map; } static inline void cpuset_cpus_allowed_locked(struct task_struct *p, - cpumask_t *mask) + struct cpumask *mask) { *mask = cpu_possible_map; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc294aa9a97a..647c77a88fcb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -289,7 +289,8 @@ static struct file_system_type cpuset_fs_type = { * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) +static void guarantee_online_cpus(const struct cpuset *cs, + struct cpumask *pmask) { while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = cs->parent; @@ -610,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -static int generate_sched_domains(cpumask_t **domains, +/* FIXME: see the FIXME in partition_sched_domains() */ +static int generate_sched_domains(struct cpumask **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -618,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct cpumask *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] cpumask_t slot */ + int nslot; /* next empty doms[] struct cpumask slot */ doms = NULL; dattr = NULL; @@ -629,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -708,7 +710,7 @@ restart: * Now we know how many domains to create. * Convert to and populate cpu masks. */ - doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -720,7 +722,7 @@ restart: for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; - cpumask_t *dp; + struct cpumask *dp; int apn = a->pn; if (apn < 0) { @@ -743,7 +745,7 @@ restart: continue; } - cpus_clear(*dp); + cpumask_clear(dp); if (dattr) *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { @@ -790,7 +792,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; get_online_cpus(); @@ -2044,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; switch (phase) { @@ -2114,7 +2116,7 @@ void __init cpuset_init_smp(void) /** * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty @@ -2122,7 +2124,7 @@ void __init cpuset_init_smp(void) * tasks cpuset. **/ -void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); cpuset_cpus_allowed_locked(tsk, pmask); @@ -2133,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { task_lock(tsk); guarantee_online_cpus(task_cs(tsk), pmask); -- cgit v1.2.3 From 61bce0f1371cfff497fe85594fd39d1a0b15ebe1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 7 Jan 2009 18:08:49 -0800 Subject: pid: generalize task_active_pid_ns Currently task_active_pid_ns is not safe to call after a task becomes a zombie and exit_task_namespaces is called, as nsproxy becomes NULL. By reading the pid namespace from the pid of the task we can trivially solve this problem at the cost of one extra memory read in what should be the same cacheline as we read the namespace from. When moving things around I have made task_active_pid_ns out of line because keeping it in pid_namespace.h would require adding includes of pid.h and sched.h that I don't think we want. This change does make task_active_pid_ns unsafe to call during copy_process until we attach a pid on the task_struct which seems to be a reasonable trade off. Signed-off-by: Eric W. Biederman Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: Bastian Blank Cc: Pavel Emelyanov Cc: Nadia Derbey Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid_namespace.h | 6 +----- kernel/fork.c | 4 ++-- kernel/pid.c | 6 ++++++ 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index d82fe825d62f..38d10326246a 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -79,11 +79,7 @@ static inline void zap_pid_ns_processes(struct pid_namespace *ns) } #endif /* CONFIG_PID_NS */ -static inline struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) -{ - return tsk->nsproxy->pid_ns; -} - +extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pidmap_init(void); diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3d..4018308048cf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1126,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(task_active_pid_ns(p)); + pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(task_active_pid_ns(p)); + retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; } diff --git a/kernel/pid.c b/kernel/pid.c index af9224cdd6c0..1b3586fe753a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -474,6 +474,12 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_session_nr_ns); +struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) +{ + return ns_of_pid(task_pid(tsk)); +} +EXPORT_SYMBOL_GPL(task_active_pid_ns); + /* * Used by proc to find the first pid that is greater than or equal to nr. * -- cgit v1.2.3 From df4927bf6ccf6278a97a44bd107080c71b269cb5 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Jan 2009 18:09:14 -0800 Subject: generic swap(): sched: remove local swap() macro Use the new generic implementation. Signed-off-by: Wu Fengguang Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched_fair.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e0c0b4bc3f08..8e1352c75557 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1617,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } } -#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) - /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks -- cgit v1.2.3 From 33b04b9308959af7febc1c111c766fa3fd8b1934 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 8 Jan 2009 12:35:11 -0800 Subject: async: make async_synchronize_full() more serializing turns out that there are real problems with allowing async tasks that are scheduled from async tasks to run after the async_synchronize_full() returns. This patch makes the _full more strict and a complete synchronization. Later I might need to add back a lighter form of synchronization for other uses.. but not right now. Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/async.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 97373380c9e7..64cc916299a5 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -206,7 +206,9 @@ EXPORT_SYMBOL_GPL(async_schedule_special); void async_synchronize_full(void) { - async_synchronize_cookie(next_cookie); + do { + async_synchronize_cookie(next_cookie); + } while (!list_empty(&async_running) || !list_empty(&async_pending)); } EXPORT_SYMBOL_GPL(async_synchronize_full); -- cgit v1.2.3 From 0de336814107358bc8c4173bf9ce2d42445173fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Jan 2009 16:13:41 +0000 Subject: CRED: Missing put_cred() in prepare_kernel_cred() Missing put_cred() in the error handling path of prepare_kernel_cred(). Signed-off-by: David Howells Acked-by: Steve Dickson Signed-off-by: Linus Torvalds --- kernel/cred.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index ff7bc071991c..fc222e4acfb0 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -529,6 +529,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) error: put_cred(new); + put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); -- cgit v1.2.3 From 43529c97122f2c851126447963eedcb8cba74fbe Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Jan 2009 16:13:46 +0000 Subject: CRED: Must initialise the new creds in prepare_kernel_cred() The newly allocated creds in prepare_kernel_cred() must be initialised before get_uid() and get_group_info() can access them. They should be copied from the old credentials. Reported-by: Steve Dickson Signed-off-by: David Howells Acked-by: Steve Dickson Signed-off-by: Linus Torvalds --- kernel/cred.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index fc222e4acfb0..043f78c133c4 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -506,6 +506,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) else old = get_cred(&init_cred); + *new = *old; get_uid(new->user); get_group_info(new->group_info); -- cgit v1.2.3 From cdb80f630be5cbc23d82331f24dc4704f75b64f4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 9 Jan 2009 13:23:45 -0800 Subject: async: make async a command line option for now ... and have it default off. This does allow people to work with it for testing. Signed-off-by: Arjan van de Ven --- kernel/async.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 64cc916299a5..f286e9f2b736 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -65,6 +65,8 @@ static LIST_HEAD(async_pending); static LIST_HEAD(async_running); static DEFINE_SPINLOCK(async_lock); +static int async_enabled = 0; + struct async_entry { struct list_head list; async_cookie_t cookie; @@ -169,7 +171,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l * If we're out of memory or if there's too much work * pending already, we execute synchronously. */ - if (!entry || atomic_read(&entry_count) > MAX_WORK) { + if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; @@ -316,8 +318,18 @@ static int async_manager_thread(void *unused) static int __init async_init(void) { - kthread_run(async_manager_thread, NULL, "async/mgr"); + if (async_enabled) + kthread_run(async_manager_thread, NULL, "async/mgr"); return 0; } +static int __init setup_async(char *str) +{ + async_enabled = 1; + return 1; +} + +__setup("fastboot", setup_async); + + core_initcall(async_init); -- cgit v1.2.3 From 62ea9ceb17a74bc7544211bfeecf4170c554ac4f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sun, 11 Jan 2009 01:04:16 +0100 Subject: cpumask: fix CONFIG_NUMA=y sched.c Impact: fix panic on ia64 with NR_CPUS=1024 struct sched_domain is now a dangling structure; where we really want static ones, we need to use static_sched_domain. (As the FIXME in this file says, cpumask_var_t would be better, but this code is hairy enough without trying to add initialization code to the right places). Reported-by: Mike Travis Signed-off-by: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index deb5ac8c12f3..f0c0a81d7638 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7282,10 +7282,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, * groups, so roll our own. Now each node has its own list of groups which * gets dynamically allocated. */ -static DEFINE_PER_CPU(struct sched_domain, node_domains); +static DEFINE_PER_CPU(struct static_sched_domain, node_domains); static struct sched_group ***sched_group_nodes_bycpu; -static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, @@ -7560,7 +7560,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_NUMA if (cpumask_weight(cpu_map) > SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { - sd = &per_cpu(allnodes_domains, i); + sd = &per_cpu(allnodes_domains, i).sd; SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), cpu_map); @@ -7570,7 +7570,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } else p = NULL; - sd = &per_cpu(node_domains, i); + sd = &per_cpu(node_domains, i).sd; SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); @@ -7688,7 +7688,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(j, nodemask) { struct sched_domain *sd; - sd = &per_cpu(node_domains, j); + sd = &per_cpu(node_domains, j).sd; sd->groups = sg; } sg->__cpu_power = 0; -- cgit v1.2.3 From 805194c35b91999b139e4d6b6145f4f84fd4c814 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 10 Jan 2009 15:43:15 +0800 Subject: sched: partly revert "sched debug: remove NULL checking in print_cfs_rt_rq()" Impact: avoid accessing NULL tg.css->cgroup In commit 0a0db8f5c9d4bbb9bbfcc2b6cb6bce2d0ef4d73d, I removed checking NULL tg.css->cgroup, but I realized I was wrong when I found reading /proc/sched_debug can race with cgroup_create(). Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4293cfa9681d..16eeba4e4169 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -145,6 +145,19 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irqrestore(&tasklist_lock, flags); } +#if defined(CONFIG_CGROUP_SCHED) && \ + (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) +static void task_group_path(struct task_group *tg, char *buf, int buflen) +{ + /* may be NULL if the underlying cgroup isn't fully-created yet */ + if (!tg->css.cgroup) { + buf[0] = '\0'; + return; + } + cgroup_path(tg->css.cgroup, buf, buflen); +} +#endif + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -154,10 +167,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) unsigned long flags; #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - char path[128] = ""; + char path[128]; struct task_group *tg = cfs_rq->tg; - cgroup_path(tg->css.cgroup, path, sizeof(path)); + task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); #elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) @@ -208,10 +221,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) - char path[128] = ""; + char path[128]; struct task_group *tg = rt_rq->tg; - cgroup_path(tg->css.cgroup, path, sizeof(path)); + task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); #else -- cgit v1.2.3 From 53ce3d9564908794ae7dd32969089b57df5fc098 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:08 -0800 Subject: smp_call_function_single(): be slightly less stupid If you do smp_call_function_single(expression-with-side-effects, ...) then expression-with-side-effects never gets evaluated on UP builds. As always, implementing it in C is the correct thing to do. While we're there, uninline it for size and possible header dependency reasons. And create a new kernel/up.c, as a place in which to put uniprocessor-specific code and storage. It should mirror kernel/smp.c. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/smp.h | 13 +++---------- kernel/Makefile | 6 +++++- kernel/up.c | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 kernel/up.c (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index b82466968101..715196b09d67 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -24,6 +24,9 @@ struct call_single_data { /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; +int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, + int wait); + #ifdef CONFIG_SMP #include @@ -79,8 +82,6 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, return 0; } -int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, - int wait); void __smp_call_function_single(int cpuid, struct call_single_data *data); /* @@ -140,14 +141,6 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) -#define smp_call_function_single(cpuid, func, info, wait) \ -({ \ - WARN_ON(cpuid != 0); \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - 0; \ -}) #define smp_call_function_mask(mask, func, info, wait) \ (up_smp_call_function(func, info)) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/Makefile b/kernel/Makefile index 2921d90ce32f..2aebc4cd7878 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,7 +40,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y) +obj-y += smp.o +else +obj-y += up.o +endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o diff --git a/kernel/up.c b/kernel/up.c new file mode 100644 index 000000000000..ce62cc9e9f71 --- /dev/null +++ b/kernel/up.c @@ -0,0 +1,18 @@ +/* + * Uniprocessor-only support functions. The counterpart to kernel/smp.c + */ + +#include +#include +#include + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) +{ + WARN_ON(cpuid != 0); + local_irq_disable(); + (func)(info); + local_irq_enable(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3 From 93423b8665f43a0c7a006a1d5be048b99db56d32 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 11 Jan 2009 05:15:21 +0100 Subject: smp_call_function_single(): be slightly less stupid, fix Impact: build fix on Alpha kernel/up.c: In function 'smp_call_function_single': kernel/up.c:12: error: 'cpuid' undeclared (first use in this function) kernel/up.c:12: error: (Each undeclared identifier is reported only once kernel/up.c:12: error: for each function it appears in.) The typo didnt show up on x86 because 'cpuid' happens to be a function address as well ... Signed-off-by: Ingo Molnar --- kernel/up.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/up.c b/kernel/up.c index ce62cc9e9f71..c04b9dcfcebe 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -9,10 +9,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { - WARN_ON(cpuid != 0); + WARN_ON(cpu != 0); + local_irq_disable(); (func)(info); local_irq_enable(); + return 0; } EXPORT_SYMBOL(smp_call_function_single); -- cgit v1.2.3 From fd2ab30b65e961b974ae0bc71e0d47d6b35e0968 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Sun, 11 Jan 2009 01:04:22 -0800 Subject: kernel/sched.c: add missing forward declaration for 'double_rq_lock' Impact: build fix on certain configs Added 'double_rq_lock' forward declaration, allowing double_rq_lock to be used in _double_lock_balance(). Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f0c0a81d7638..8be2c13b50d0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -125,6 +125,9 @@ DEFINE_TRACE(sched_switch); DEFINE_TRACE(sched_migrate_task); #ifdef CONFIG_SMP + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) * Since cpu_power is a 'constant', we can use a reciprocal divide. -- cgit v1.2.3 From 01e3eb82278bf45221fc38b391bc5ee0f6a314d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 Jan 2009 13:00:50 +0100 Subject: Revert "sched: improve preempt debugging" This reverts commit 7317d7b87edb41a9135e30be1ec3f7ef817c53dd. This has been reported (and bisected) by Alexey Zaytsev and Kamalesh Babulal to produce annoying warnings during bootup on both x86 and powerpc. kernel_locked() is not a valid test in IRQ context (we update the BKL's ->lock_depth and the preempt count separately and non-atomicalyy), so we cannot put it into the generic preempt debugging checks which can run in IRQ contexts too. Reported-and-bisected-by: Alexey Zaytsev Reported-and-bisected-by: Kamalesh Babulal Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..3b630d882660 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4440,7 +4440,7 @@ void __kprobes sub_preempt_count(int val) /* * Underflow? */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) return; /* * Is the spinlock portion underflowing? -- cgit v1.2.3 From 6e96281412f2f757abe623e08a9577e2bbd3402f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 Jan 2009 16:04:37 +0100 Subject: smp_call_function_single(): be slightly less stupid, fix #2 fix m68k build failure: tip/kernel/up.c: In function 'smp_call_function_single': tip/kernel/up.c:16: error: dereferencing pointer to incomplete type make[2]: *** [kernel/up.o] Error 1 Signed-off-by: Ingo Molnar --- kernel/up.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/up.c b/kernel/up.c index c04b9dcfcebe..1ff27a28bb7d 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -2,6 +2,7 @@ * Uniprocessor-only support functions. The counterpart to kernel/smp.c */ +#include #include #include #include -- cgit v1.2.3 From 37a76bd4f1b716949fc38a6842e89f0ccb8384d0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 11 Jan 2009 15:35:01 +0000 Subject: async: fix __lowest_in_progress() At 37000 feet somewhere near Greenland I woke up from a half-sleep with the realisation that __lowest_in_progress() is buggy. After landing I checked and there were indeed 2 problems with it; this patch fixes both: * The order of the list checks was wrong * The locking was not correct. Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/async.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index f286e9f2b736..608b32b42812 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -90,12 +90,12 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; - if (!list_empty(&async_pending)) { - entry = list_first_entry(&async_pending, + if (!list_empty(running)) { + entry = list_first_entry(running, struct async_entry, list); return entry->cookie; - } else if (!list_empty(running)) { - entry = list_first_entry(running, + } else if (!list_empty(&async_pending)) { + entry = list_first_entry(&async_pending, struct async_entry, list); return entry->cookie; } else { @@ -104,6 +104,17 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) } } + +static async_cookie_t lowest_in_progress(struct list_head *running) +{ + unsigned long flags; + async_cookie_t ret; + + spin_lock_irqsave(&async_lock, flags); + ret = __lowest_in_progress(running); + spin_unlock_irqrestore(&async_lock, flags); + return ret; +} /* * pick the first pending entry and run it */ @@ -229,7 +240,7 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r starttime = ktime_get(); } - wait_event(async_done, __lowest_in_progress(running) >= cookie); + wait_event(async_done, lowest_in_progress(running) >= cookie); if (initcall_debug && system_state == SYSTEM_BOOTING) { endtime = ktime_get(); -- cgit v1.2.3 From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:54 +0100 Subject: [CVE-2009-0029] Convert all system calls to return a long Convert all system calls to return a long. This should be a NOP since all converted types should have the same size anyway. With the exception of sys_exit_group which returned void. But that doesn't matter since the system call doesn't return. Signed-off-by: Heiko Carstens --- fs/read_write.c | 18 +++++------ fs/xattr.c | 12 ++++---- include/linux/syscalls.h | 79 ++++++++++++++++++++++++------------------------ ipc/mqueue.c | 2 +- kernel/exit.c | 4 ++- kernel/signal.c | 2 +- kernel/timer.c | 2 +- mm/filemap.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/nommu.c | 2 +- 11 files changed, 64 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/fs/read_write.c b/fs/read_write.c index 5cc6924eb158..940367f51f2a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(vfs_llseek); -asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin) +asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin) { off_t retval; struct file * file; @@ -369,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) file->f_pos = pos; } -asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) +asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count) { struct file *file; ssize_t ret = -EBADF; @@ -386,7 +386,7 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) return ret; } -asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count) +asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count) { struct file *file; ssize_t ret = -EBADF; @@ -403,7 +403,7 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co return ret; } -asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, +asmlinkage long sys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct file *file; @@ -424,7 +424,7 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, return ret; } -asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, +asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct file *file; @@ -672,7 +672,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_writev); -asmlinkage ssize_t +asmlinkage long sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { struct file *file; @@ -693,7 +693,7 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) return ret; } -asmlinkage ssize_t +asmlinkage long sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) { struct file *file; @@ -812,7 +812,7 @@ out: return retval; } -asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) +asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) { loff_t pos; off_t off; @@ -831,7 +831,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, siz return do_sendfile(out_fd, in_fd, NULL, count, 0); } -asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) +asmlinkage long sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) { loff_t pos; ssize_t ret; diff --git a/fs/xattr.c b/fs/xattr.c index 237804cd6b56..d049ae27aae7 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -349,7 +349,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, return error; } -asmlinkage ssize_t +asmlinkage long sys_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { @@ -364,7 +364,7 @@ sys_getxattr(const char __user *pathname, const char __user *name, return error; } -asmlinkage ssize_t +asmlinkage long sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size) { @@ -379,7 +379,7 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user return error; } -asmlinkage ssize_t +asmlinkage long sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size) { struct file *f; @@ -424,7 +424,7 @@ listxattr(struct dentry *d, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_listxattr(const char __user *pathname, char __user *list, size_t size) { struct path path; @@ -438,7 +438,7 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_llistxattr(const char __user *pathname, char __user *list, size_t size) { struct path path; @@ -452,7 +452,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size) return error; } -asmlinkage ssize_t +asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size) { struct file *f; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a7593f670ca6..22290eeaf553 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,7 +77,7 @@ asmlinkage long sys_times(struct tms __user *tbuf); asmlinkage long sys_gettid(void); asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp); -asmlinkage unsigned long sys_alarm(unsigned int seconds); +asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpid(void); asmlinkage long sys_getppid(void); asmlinkage long sys_getuid(void); @@ -166,7 +166,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, unsigned long flags); asmlinkage long sys_exit(int error_code); -asmlinkage void sys_exit_group(int error_code); +asmlinkage long sys_exit_group(int error_code); asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, int options, struct rusage __user *ru); asmlinkage long sys_waitid(int which, pid_t pid, @@ -196,7 +196,7 @@ asmlinkage long sys_tkill(int pid, int sig); asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo); asmlinkage long sys_sgetmask(void); asmlinkage long sys_ssetmask(int newmask); -asmlinkage unsigned long sys_signal(int sig, __sighandler_t handler); +asmlinkage long sys_signal(int sig, __sighandler_t handler); asmlinkage long sys_pause(void); asmlinkage long sys_sync(void); @@ -246,29 +246,29 @@ asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name, const void __user *value, size_t size, int flags); asmlinkage long sys_fsetxattr(int fd, const char __user *name, const void __user *value, size_t size, int flags); -asmlinkage ssize_t sys_getxattr(const char __user *path, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_lgetxattr(const char __user *path, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_fgetxattr(int fd, const char __user *name, - void __user *value, size_t size); -asmlinkage ssize_t sys_listxattr(const char __user *path, char __user *list, - size_t size); -asmlinkage ssize_t sys_llistxattr(const char __user *path, char __user *list, - size_t size); -asmlinkage ssize_t sys_flistxattr(int fd, char __user *list, size_t size); +asmlinkage long sys_getxattr(const char __user *path, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_fgetxattr(int fd, const char __user *name, + void __user *value, size_t size); +asmlinkage long sys_listxattr(const char __user *path, char __user *list, + size_t size); +asmlinkage long sys_llistxattr(const char __user *path, char __user *list, + size_t size); +asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size); asmlinkage long sys_removexattr(const char __user *path, const char __user *name); asmlinkage long sys_lremovexattr(const char __user *path, const char __user *name); asmlinkage long sys_fremovexattr(int fd, const char __user *name); -asmlinkage unsigned long sys_brk(unsigned long brk); +asmlinkage long sys_brk(unsigned long brk); asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot); -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr); +asmlinkage long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); @@ -321,10 +321,10 @@ asmlinkage long sys_io_submit(aio_context_t, long, struct iocb __user * __user *); asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, struct io_event __user *result); -asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, - off_t __user *offset, size_t count); -asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, - loff_t __user *offset, size_t count); +asmlinkage long sys_sendfile(int out_fd, int in_fd, + off_t __user *offset, size_t count); +asmlinkage long sys_sendfile64(int out_fd, int in_fd, + loff_t __user *offset, size_t count); asmlinkage long sys_readlink(const char __user *path, char __user *buf, int bufsiz); asmlinkage long sys_creat(const char __user *pathname, int mode); @@ -368,26 +368,25 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times); asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes); -asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, - unsigned int origin); +asmlinkage long sys_lseek(unsigned int fd, off_t offset, + unsigned int origin); asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, unsigned long offset_low, loff_t __user *result, unsigned int origin); -asmlinkage ssize_t sys_read(unsigned int fd, char __user *buf, - size_t count); -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count); -asmlinkage ssize_t sys_readv(unsigned long fd, - const struct iovec __user *vec, - unsigned long vlen); -asmlinkage ssize_t sys_write(unsigned int fd, const char __user *buf, - size_t count); -asmlinkage ssize_t sys_writev(unsigned long fd, - const struct iovec __user *vec, - unsigned long vlen); -asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, - size_t count, loff_t pos); -asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, - size_t count, loff_t pos); +asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count); +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count); +asmlinkage long sys_readv(unsigned long fd, + const struct iovec __user *vec, + unsigned long vlen); +asmlinkage long sys_write(unsigned int fd, const char __user *buf, + size_t count); +asmlinkage long sys_writev(unsigned long fd, + const struct iovec __user *vec, + unsigned long vlen); +asmlinkage long sys_pread64(unsigned int fd, char __user *buf, + size_t count, loff_t pos); +asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf, + size_t count, loff_t pos); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, int mode); asmlinkage long sys_chdir(const char __user *filename); @@ -476,7 +475,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); asmlinkage long sys_mq_unlink(const char __user *name); asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); -asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 23fdb8492b8e..6df028b70543 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -907,7 +907,7 @@ out: return ret; } -asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, +asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout) { diff --git a/kernel/exit.c b/kernel/exit.c index c7740fa3252c..fac9b040af2c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1182,9 +1182,11 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ -asmlinkage void sys_exit_group(int error_code) +asmlinkage long sys_exit_group(int error_code) { do_group_exit((error_code & 0xff) << 8); + /* NOTREACHED */ + return 0; } static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) diff --git a/kernel/signal.c b/kernel/signal.c index 3152ac3b62e2..856a5479d49d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2559,7 +2559,7 @@ sys_ssetmask(int newmask) /* * For backwards compatibility. Functionality superseded by sigaction. */ -asmlinkage unsigned long +asmlinkage long sys_signal(int sig, __sighandler_t handler) { struct k_sigaction new_sa, old_sa; diff --git a/kernel/timer.c b/kernel/timer.c index dee3f641a7a7..7b8697d7f04d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks) * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ -asmlinkage unsigned long sys_alarm(unsigned int seconds) +asmlinkage long sys_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } diff --git a/mm/filemap.c b/mm/filemap.c index ceba0bd03662..538b75ed6236 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; diff --git a/mm/mmap.c b/mm/mmap.c index 749623196cb9..a970d890cb21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; diff --git a/mm/mremap.c b/mm/mremap.c index 646de959aa58..5572e0825d80 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,7 +420,7 @@ out_nc: return ret; } -asmlinkage unsigned long sys_mremap(unsigned long addr, +asmlinkage long sys_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { diff --git a/mm/nommu.c b/mm/nommu.c index 60ed8375c986..ee3e78927739 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { struct mm_struct *mm = current->mm; -- cgit v1.2.3 From f627a741d24f12955fa2d9f8831c3b12860635bd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:58 +0100 Subject: [CVE-2009-0029] Make sys_syslog a conditional system call Remove the -ENOSYS implementation for !CONFIG_PRINTK and use the cond_syscall infrastructure instead. Acked-by: Kyle McMartin Signed-off-by: Heiko Carstens --- kernel/printk.c | 5 ----- kernel/sys_ni.c | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 7015733793e8..e48cf33783fc 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -742,11 +742,6 @@ EXPORT_SYMBOL(vprintk); #else -asmlinkage long sys_syslog(int type, char __user *buf, int len) -{ - return -ENOSYS; -} - static void call_console_drivers(unsigned start, unsigned end) { } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e14a23281707..27dad2967387 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -131,6 +131,7 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(sys_syslog); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3 From 58fd3aa288939d3097fa04505b25c2f5e6e144d1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:03 +0100 Subject: [CVE-2009-0029] System call wrappers part 01 Signed-off-by: Heiko Carstens --- kernel/hrtimer.c | 4 ++-- kernel/sys.c | 2 +- kernel/time.c | 14 +++++++------- kernel/timer.c | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1455b7651b6b..2dc30c59c5fd 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1467,8 +1467,8 @@ out: return ret; } -asmlinkage long -sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) +SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, + struct timespec __user *, rmtp) { struct timespec tu; diff --git a/kernel/sys.c b/kernel/sys.c index 763c3c17ded3..37165e552331 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -919,7 +919,7 @@ void do_sys_times(struct tms *tms) tms->tms_cstime = cputime_to_clock_t(cstime); } -asmlinkage long sys_times(struct tms __user * tbuf) +SYSCALL_DEFINE1(times, struct tms __user *, tbuf) { if (tbuf) { struct tms tmp; diff --git a/kernel/time.c b/kernel/time.c index 4886e3ce83a4..29511943871a 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -60,7 +60,7 @@ EXPORT_SYMBOL(sys_tz); * why not move it into the appropriate arch directory (for those * architectures that need it). */ -asmlinkage long sys_time(time_t __user * tloc) +SYSCALL_DEFINE1(time, time_t __user *, tloc) { time_t i = get_seconds(); @@ -79,7 +79,7 @@ asmlinkage long sys_time(time_t __user * tloc) * architectures that need it). */ -asmlinkage long sys_stime(time_t __user *tptr) +SYSCALL_DEFINE1(stime, time_t __user *, tptr) { struct timespec tv; int err; @@ -99,8 +99,8 @@ asmlinkage long sys_stime(time_t __user *tptr) #endif /* __ARCH_WANT_SYS_TIME */ -asmlinkage long sys_gettimeofday(struct timeval __user *tv, - struct timezone __user *tz) +SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) { if (likely(tv != NULL)) { struct timeval ktv; @@ -184,8 +184,8 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) return 0; } -asmlinkage long sys_settimeofday(struct timeval __user *tv, - struct timezone __user *tz) +SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) { struct timeval user_tv; struct timespec new_ts; @@ -205,7 +205,7 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -asmlinkage long sys_adjtimex(struct timex __user *txc_p) +SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) { struct timex txc; /* Local copy of parameter */ int ret; diff --git a/kernel/timer.c b/kernel/timer.c index 7b8697d7f04d..76041df06c57 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1129,7 +1129,7 @@ void do_timer(unsigned long ticks) * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ -asmlinkage long sys_alarm(unsigned int seconds) +SYSCALL_DEFINE1(alarm, unsigned int, seconds) { return alarm_setitimer(seconds); } @@ -1152,7 +1152,7 @@ asmlinkage long sys_alarm(unsigned int seconds) * * This is SMP safe as current->tgid does not change. */ -asmlinkage long sys_getpid(void) +SYSCALL_DEFINE0(getpid) { return task_tgid_vnr(current); } @@ -1308,7 +1308,7 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) EXPORT_SYMBOL(schedule_timeout_uninterruptible); /* Thread ID - the internal kernel "pid" */ -asmlinkage long sys_gettid(void) +SYSCALL_DEFINE0(gettid) { return task_pid_vnr(current); } -- cgit v1.2.3 From dbf040d9d1cbf1ef6250bdb095c5c118950bcde8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:04 +0100 Subject: [CVE-2009-0029] System call wrappers part 02 Signed-off-by: Heiko Carstens --- kernel/sys.c | 10 +++++----- kernel/timer.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 37165e552331..4c33555f8d95 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -756,7 +756,7 @@ error: return retval; } -asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) { const struct cred *cred = current_cred(); int retval; @@ -814,7 +814,7 @@ error: return retval; } -asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) { const struct cred *cred = current_cred(); int retval; @@ -1015,7 +1015,7 @@ out: return err; } -asmlinkage long sys_getpgid(pid_t pid) +SYSCALL_DEFINE1(getpgid, pid_t, pid) { struct task_struct *p; struct pid *grp; @@ -1045,14 +1045,14 @@ out: #ifdef __ARCH_WANT_SYS_GETPGRP -asmlinkage long sys_getpgrp(void) +SYSCALL_DEFINE0(getpgrp) { return sys_getpgid(0); } #endif -asmlinkage long sys_getsid(pid_t pid) +SYSCALL_DEFINE1(getsid, pid_t, pid) { struct task_struct *p; struct pid *sid; diff --git a/kernel/timer.c b/kernel/timer.c index 76041df06c57..14a51530a4cd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1163,7 +1163,7 @@ SYSCALL_DEFINE0(getpid) * value of ->real_parent under rcu_read_lock(), see * release_task()->call_rcu(delayed_put_task_struct). */ -asmlinkage long sys_getppid(void) +SYSCALL_DEFINE0(getppid) { int pid; @@ -1174,25 +1174,25 @@ asmlinkage long sys_getppid(void) return pid; } -asmlinkage long sys_getuid(void) +SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ return current_uid(); } -asmlinkage long sys_geteuid(void) +SYSCALL_DEFINE0(geteuid) { /* Only we change this so SMP safe */ return current_euid(); } -asmlinkage long sys_getgid(void) +SYSCALL_DEFINE0(getgid) { /* Only we change this so SMP safe */ return current_gid(); } -asmlinkage long sys_getegid(void) +SYSCALL_DEFINE0(getegid) { /* Only we change this so SMP safe */ return current_egid(); -- cgit v1.2.3 From ae1251ab785f6da87219df8352ffdac68bba23e4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:05 +0100 Subject: [CVE-2009-0029] System call wrappers part 03 Signed-off-by: Heiko Carstens --- kernel/sys.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 4c33555f8d95..ace9ced598b9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -478,7 +478,7 @@ void ctrl_alt_del(void) * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ -asmlinkage long sys_setregid(gid_t rgid, gid_t egid) +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { const struct cred *old; struct cred *new; @@ -529,7 +529,7 @@ error: * * SMP: Same implicit races as above. */ -asmlinkage long sys_setgid(gid_t gid) +SYSCALL_DEFINE1(setgid, gid_t, gid) { const struct cred *old; struct cred *new; @@ -597,7 +597,7 @@ static int set_user(struct cred *new) * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs. */ -asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { const struct cred *old; struct cred *new; @@ -661,7 +661,7 @@ error: * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid. */ -asmlinkage long sys_setuid(uid_t uid) +SYSCALL_DEFINE1(setuid, uid_t, uid) { const struct cred *old; struct cred *new; @@ -705,7 +705,7 @@ error: * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid(). */ -asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { const struct cred *old; struct cred *new; @@ -771,7 +771,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u /* * Same as above, but for rgid, egid, sgid. */ -asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { const struct cred *old; struct cred *new; @@ -833,7 +833,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __u * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access.. */ -asmlinkage long sys_setfsuid(uid_t uid) +SYSCALL_DEFINE1(setfsuid, uid_t, uid) { const struct cred *old; struct cred *new; @@ -870,7 +870,7 @@ change_okay: /* * Samma pÃ¥ svenska.. */ -asmlinkage long sys_setfsgid(gid_t gid) +SYSCALL_DEFINE1(setfsgid, gid_t, gid) { const struct cred *old; struct cred *new; @@ -1311,7 +1311,7 @@ int set_current_groups(struct group_info *group_info) EXPORT_SYMBOL(set_current_groups); -asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) +SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; -- cgit v1.2.3 From b290ebe2c46d01b742b948ce03f09e8a3efb9a92 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:06 +0100 Subject: [CVE-2009-0029] System call wrappers part 04 Signed-off-by: Heiko Carstens --- kernel/acct.c | 2 +- kernel/capability.c | 4 ++-- kernel/exec_domain.c | 3 +-- kernel/itimer.c | 2 +- kernel/signal.c | 7 +++---- kernel/sys.c | 6 +++--- 6 files changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index d57b7cbb98b6..7afa31564162 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -277,7 +277,7 @@ static int acct_on(char *name) * should be written. If the filename is NULL, accounting will be * shutdown. */ -asmlinkage long sys_acct(const char __user *name) +SYSCALL_DEFINE1(acct, const char __user *, name) { int error; diff --git a/kernel/capability.c b/kernel/capability.c index 688926e496be..4e17041963f5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -161,7 +161,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, * * Returns 0 on success and < 0 on error. */ -asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) +SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; @@ -235,7 +235,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) * * Returns 0 on success and < 0 on error. */ -asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) +SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0511716e9424..667c841c2952 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -209,8 +209,7 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -asmlinkage long -sys_personality(u_long personality) +SYSCALL_DEFINE1(personality, u_long, personality) { u_long old = current->personality; diff --git a/kernel/itimer.c b/kernel/itimer.c index db7c358b9a02..7e0663ea94fc 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -100,7 +100,7 @@ int do_getitimer(int which, struct itimerval *value) return 0; } -asmlinkage long sys_getitimer(int which, struct itimerval __user *value) +SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) { int error = -EFAULT; struct itimerval get_buffer; diff --git a/kernel/signal.c b/kernel/signal.c index 856a5479d49d..3fe08eaa5dea 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2434,8 +2434,7 @@ out: #ifdef __ARCH_WANT_SYS_SIGPENDING -asmlinkage long -sys_sigpending(old_sigset_t __user *set) +SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { return do_sigpending(set, sizeof(*set)); } @@ -2446,8 +2445,8 @@ sys_sigpending(old_sigset_t __user *set) /* Some platforms have their own version with special arguments others support only sys_rt_sigprocmask. */ -asmlinkage long -sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset) +SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, + old_sigset_t __user *, oset) { int error; old_sigset_t old_set, new_set; diff --git a/kernel/sys.c b/kernel/sys.c index ace9ced598b9..cbe4502c28a1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -944,7 +944,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. * LBT 04.03.94 */ -asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) +SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; @@ -1080,7 +1080,7 @@ out: return retval; } -asmlinkage long sys_setsid(void) +SYSCALL_DEFINE0(setsid) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); @@ -1340,7 +1340,7 @@ out: * without another task interfering. */ -asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) +SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) { struct group_info *group_info; int retval; -- cgit v1.2.3 From 362e9c07c7220c0a78c88826fc0d2bf7e4a4bb68 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:07 +0100 Subject: [CVE-2009-0029] System call wrappers part 05 Signed-off-by: Heiko Carstens --- kernel/itimer.c | 5 ++--- kernel/posix-timers.c | 43 +++++++++++++++++++------------------------ 2 files changed, 21 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 7e0663ea94fc..6a5fe93dd8bd 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -260,9 +260,8 @@ unsigned int alarm_setitimer(unsigned int seconds) return it_old.it_value.tv_sec; } -asmlinkage long sys_setitimer(int which, - struct itimerval __user *value, - struct itimerval __user *ovalue) +SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, + struct itimerval __user *, ovalue) { struct itimerval set_buffer, get_buffer; int error; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 887c63787de6..052ec4d195c7 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -477,10 +477,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) /* Create a POSIX.1b interval timer. */ -asmlinkage long -sys_timer_create(const clockid_t which_clock, - struct sigevent __user *timer_event_spec, - timer_t __user * created_timer_id) +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) { struct k_itimer *new_timer; int error, new_timer_id; @@ -661,8 +660,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) } /* Get the time remaining on a POSIX.1b interval timer. */ -asmlinkage long -sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct itimerspec __user *, setting) { struct k_itimer *timr; struct itimerspec cur_setting; @@ -691,8 +690,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) * the call back to do_schedule_next_timer(). So all we need to do is * to pick up the frozen overrun. */ -asmlinkage long -sys_timer_getoverrun(timer_t timer_id) +SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { struct k_itimer *timr; int overrun; @@ -760,10 +758,9 @@ common_timer_set(struct k_itimer *timr, int flags, } /* Set a POSIX.1b interval timer */ -asmlinkage long -sys_timer_settime(timer_t timer_id, int flags, - const struct itimerspec __user *new_setting, - struct itimerspec __user *old_setting) +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct itimerspec __user *, new_setting, + struct itimerspec __user *, old_setting) { struct k_itimer *timr; struct itimerspec new_spec, old_spec; @@ -816,8 +813,7 @@ static inline int timer_delete_hook(struct k_itimer *timer) } /* Delete a POSIX.1b interval timer. */ -asmlinkage long -sys_timer_delete(timer_t timer_id) +SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; unsigned long flags; @@ -903,8 +899,8 @@ int do_posix_clock_nonanosleep(const clockid_t clock, int flags, } EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); -asmlinkage long sys_clock_settime(const clockid_t which_clock, - const struct timespec __user *tp) +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) { struct timespec new_tp; @@ -916,8 +912,8 @@ asmlinkage long sys_clock_settime(const clockid_t which_clock, return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); } -asmlinkage long -sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) { struct timespec kernel_tp; int error; @@ -933,8 +929,8 @@ sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) } -asmlinkage long -sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct timespec __user *, tp) { struct timespec rtn_tp; int error; @@ -963,10 +959,9 @@ static int common_nsleep(const clockid_t which_clock, int flags, which_clock); } -asmlinkage long -sys_clock_nanosleep(const clockid_t which_clock, int flags, - const struct timespec __user *rqtp, - struct timespec __user *rmtp) +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) { struct timespec t; -- cgit v1.2.3 From 5add95d4f7cf08f6f62510f19576992912387501 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:08 +0100 Subject: [CVE-2009-0029] System call wrappers part 06 Signed-off-by: Heiko Carstens --- kernel/sched.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8be2c13b50d0..1a0fdfa5ddf9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5126,7 +5126,7 @@ int can_nice(const struct task_struct *p, const int nice) * sys_setpriority is a more generic, but much slower function that * does similar things. */ -asmlinkage long sys_nice(int increment) +SYSCALL_DEFINE1(nice, int, increment) { long nice, retval; @@ -5433,8 +5433,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @policy: new policy. * @param: structure containing the new RT priority. */ -asmlinkage long -sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, + struct sched_param __user *, param) { /* negative values for policy are not valid */ if (policy < 0) @@ -5448,7 +5448,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @pid: the pid in question. * @param: structure containing the new RT priority. */ -asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { return do_sched_setscheduler(pid, -1, param); } @@ -5457,7 +5457,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. */ -asmlinkage long sys_sched_getscheduler(pid_t pid) +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) { struct task_struct *p; int retval; @@ -5482,7 +5482,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) * @pid: the pid in question. * @param: structure containing the RT priority. */ -asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { struct sched_param lp; struct task_struct *p; @@ -5600,8 +5600,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask */ -asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; @@ -5648,8 +5648,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask */ -asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) { int ret; cpumask_var_t mask; @@ -5678,7 +5678,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, * This function yields the current CPU to other tasks. If there are no * other threads running on this CPU then this function will return. */ -asmlinkage long sys_sched_yield(void) +SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); @@ -5819,7 +5819,7 @@ long __sched io_schedule_timeout(long timeout) * this syscall returns the maximum rt_priority that can be used * by a given scheduling class. */ -asmlinkage long sys_sched_get_priority_max(int policy) +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) { int ret = -EINVAL; @@ -5844,7 +5844,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) * this syscall returns the minimum rt_priority that can be used * by a given scheduling class. */ -asmlinkage long sys_sched_get_priority_min(int policy) +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) { int ret = -EINVAL; -- cgit v1.2.3 From 754fe8d297bfae7b77f7ce866e2fb0c5fb186506 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:09 +0100 Subject: [CVE-2009-0029] System call wrappers part 07 Signed-off-by: Heiko Carstens --- kernel/exit.c | 8 ++++---- kernel/kexec.c | 5 ++--- kernel/sched.c | 4 ++-- kernel/signal.c | 2 +- kernel/sys.c | 7 ++++--- net/socket.c | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fac9b040af2c..08895df0eab3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1141,7 +1141,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) EXPORT_SYMBOL(complete_and_exit); -asmlinkage long sys_exit(int error_code) +SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } @@ -1182,7 +1182,7 @@ do_group_exit(int exit_code) * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ -asmlinkage long sys_exit_group(int error_code) +SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ @@ -1795,8 +1795,8 @@ asmlinkage long sys_waitid(int which, pid_t upid, return ret; } -asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, - int options, struct rusage __user *ru) +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) { struct pid *pid = NULL; enum pid_type type; diff --git a/kernel/kexec.c b/kernel/kexec.c index 3fb855ad6aa0..8a6d7b08864e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -934,9 +934,8 @@ struct kimage *kexec_crash_image; static DEFINE_MUTEX(kexec_mutex); -asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, - unsigned long flags) +SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, + struct kexec_segment __user *, segments, unsigned long, flags) { struct kimage **dest_image, *image; int result; diff --git a/kernel/sched.c b/kernel/sched.c index 1a0fdfa5ddf9..65c02037b052 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5869,8 +5869,8 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. */ -asmlinkage -long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) { struct task_struct *p; unsigned int time_slice; diff --git a/kernel/signal.c b/kernel/signal.c index 3fe08eaa5dea..41f32e08615e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1961,7 +1961,7 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. */ -asmlinkage long sys_restart_syscall(void) +SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = ¤t_thread_info()->restart_block; return restart->fn(restart); diff --git a/kernel/sys.c b/kernel/sys.c index cbe4502c28a1..39b192b40034 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -143,7 +143,7 @@ out: return error; } -asmlinkage long sys_setpriority(int which, int who, int niceval) +SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) { struct task_struct *g, *p; struct user_struct *user; @@ -208,7 +208,7 @@ out: * has been offset by 20 (ie it returns 40..1 instead of -20..19) * to stay compatible. */ -asmlinkage long sys_getpriority(int which, int who) +SYSCALL_DEFINE2(getpriority, int, which, int, who) { struct task_struct *g, *p; struct user_struct *user; @@ -355,7 +355,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off); * * reboot doesn't sync: do that yourself before calling this. */ -asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) { char buffer[256]; diff --git a/net/socket.c b/net/socket.c index 06603d73c411..cc9b666e58f6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1789,7 +1789,7 @@ out_put: * Shutdown a socket. */ -asmlinkage long sys_shutdown(int fd, int how) +SYSCALL_DEFINE2(shutdown, int, fd, int, how) { int err, fput_needed; struct socket *sock; -- cgit v1.2.3 From 17da2bd90abf428523de0fb98f7075e00e3ed42e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:10 +0100 Subject: [CVE-2009-0029] System call wrappers part 08 Signed-off-by: Heiko Carstens --- kernel/exit.c | 7 +++---- kernel/fork.c | 2 +- kernel/futex.c | 6 +++--- kernel/module.c | 10 ++++------ kernel/sched.c | 2 +- kernel/signal.c | 18 +++++++----------- 6 files changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 08895df0eab3..f80dec3f1875 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1754,9 +1754,8 @@ end: return retval; } -asmlinkage long sys_waitid(int which, pid_t upid, - struct siginfo __user *infop, int options, - struct rusage __user *ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) { struct pid *pid = NULL; enum pid_type type; @@ -1833,7 +1832,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ -asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) +SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return sys_wait4(pid, stat_addr, options, NULL); } diff --git a/kernel/fork.c b/kernel/fork.c index 1d68f1255dd8..8eb37d38c6a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -901,7 +901,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) clear_freeze_flag(p); } -asmlinkage long sys_set_tid_address(int __user *tidptr) +SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; diff --git a/kernel/futex.c b/kernel/futex.c index 002aa189eb09..e86931d8d4e9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1978,9 +1978,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, } -asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, - struct timespec __user *utime, u32 __user *uaddr2, - u32 val3) +SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) { struct timespec ts; ktime_t t, *tp = NULL; diff --git a/kernel/module.c b/kernel/module.c index c9332c90d5a0..e8b51d41dd72 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -743,8 +743,8 @@ static void wait_for_zero_refcount(struct module *mod) mutex_lock(&module_mutex); } -asmlinkage long -sys_delete_module(const char __user *name_user, unsigned int flags) +SYSCALL_DEFINE2(delete_module, const char __user *, name_user, + unsigned int, flags) { struct module *mod; char name[MODULE_NAME_LEN]; @@ -2296,10 +2296,8 @@ static noinline struct module *load_module(void __user *umod, } /* This is where the real work happens */ -asmlinkage long -sys_init_module(void __user *umod, - unsigned long len, - const char __user *uargs) +SYSCALL_DEFINE3(init_module, void __user *, umod, + unsigned long, len, const char __user *, uargs) { struct module *mod; int ret = 0; diff --git a/kernel/sched.c b/kernel/sched.c index 65c02037b052..eb1931eef587 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5869,7 +5869,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. */ -SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid, +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { struct task_struct *p; diff --git a/kernel/signal.c b/kernel/signal.c index 41f32e08615e..278cc8737f17 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2014,8 +2014,8 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return error; } -asmlinkage long -sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, + sigset_t __user *, oset, size_t, sigsetsize) { int error = -EINVAL; sigset_t old_set, new_set; @@ -2074,8 +2074,7 @@ out: return error; } -asmlinkage long -sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize) +SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); } @@ -2146,11 +2145,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif -asmlinkage long -sys_rt_sigtimedwait(const sigset_t __user *uthese, - siginfo_t __user *uinfo, - const struct timespec __user *uts, - size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + siginfo_t __user *, uinfo, const struct timespec __user *, uts, + size_t, sigsetsize) { int ret, sig; sigset_t these; @@ -2223,8 +2220,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, return ret; } -asmlinkage long -sys_kill(pid_t pid, int sig) +SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; -- cgit v1.2.3 From a5f8fa9e9ba5ef3305e147f41ad6e1e84ac1f0bd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:11 +0100 Subject: [CVE-2009-0029] System call wrappers part 09 Signed-off-by: Heiko Carstens --- fs/sync.c | 6 +++--- kernel/signal.c | 21 ++++++++------------- 2 files changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/fs/sync.c b/fs/sync.c index 23ebbd72ecc9..a16d53e5fe9d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -36,7 +36,7 @@ static void do_sync(unsigned long wait) laptop_sync_completion(); } -asmlinkage long sys_sync(void) +SYSCALL_DEFINE0(sync) { do_sync(1); return 0; @@ -144,12 +144,12 @@ static int do_fsync(unsigned int fd, int datasync) return ret; } -asmlinkage long sys_fsync(unsigned int fd) +SYSCALL_DEFINE1(fsync, unsigned int, fd) { return do_fsync(fd, 0); } -asmlinkage long sys_fdatasync(unsigned int fd) +SYSCALL_DEFINE1(fdatasync, unsigned int, fd) { return do_fsync(fd, 1); } diff --git a/kernel/signal.c b/kernel/signal.c index 278cc8737f17..e2333929611a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2279,7 +2279,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ -asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig) +SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) @@ -2291,8 +2291,7 @@ asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig) /* * Send a signal to only one task, even if it's a CLONE_THREAD task. */ -asmlinkage long -sys_tkill(pid_t pid, int sig) +SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0) @@ -2301,8 +2300,8 @@ sys_tkill(pid_t pid, int sig) return do_tkill(0, pid, sig); } -asmlinkage long -sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo) +SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) { siginfo_t info; @@ -2526,15 +2525,13 @@ out: /* * For backwards compatibility. Functionality superseded by sigprocmask. */ -asmlinkage long -sys_sgetmask(void) +SYSCALL_DEFINE0(sgetmask) { /* SMP safe */ return current->blocked.sig[0]; } -asmlinkage long -sys_ssetmask(int newmask) +SYSCALL_DEFINE1(ssetmask, int, newmask) { int old; @@ -2554,8 +2551,7 @@ sys_ssetmask(int newmask) /* * For backwards compatibility. Functionality superseded by sigaction. */ -asmlinkage long -sys_signal(int sig, __sighandler_t handler) +SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) { struct k_sigaction new_sa, old_sa; int ret; @@ -2572,8 +2568,7 @@ sys_signal(int sig, __sighandler_t handler) #ifdef __ARCH_WANT_SYS_PAUSE -asmlinkage long -sys_pause(void) +SYSCALL_DEFINE0(pause) { current->state = TASK_INTERRUPTIBLE; schedule(); -- cgit v1.2.3 From ca013e945b1ba5828b151ee646946f1297b67a4c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:19 +0100 Subject: [CVE-2009-0029] System call wrappers part 17 Signed-off-by: Heiko Carstens --- fs/open.c | 16 +++++++--------- kernel/uid16.c | 6 +++--- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/open.c b/fs/open.c index 293408b1c165..4a6d80064746 100644 --- a/fs/open.c +++ b/fs/open.c @@ -517,7 +517,7 @@ out: return res; } -asmlinkage long sys_access(const char __user *filename, int mode) +SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return sys_faccessat(AT_FDCWD, filename, mode); } @@ -688,7 +688,7 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) return error; } -asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group) +SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { struct path path; int error; @@ -732,7 +732,7 @@ out: return error; } -asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group) +SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { struct path path; int error; @@ -751,8 +751,7 @@ out: return error; } - -asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) +SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { struct file * file; int error = -EBADF; @@ -1048,7 +1047,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) return fd; } -asmlinkage long sys_open(const char __user *filename, int flags, int mode) +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) { long ret; @@ -1117,7 +1116,7 @@ EXPORT_SYMBOL(filp_close); * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. */ -asmlinkage long sys_close(unsigned int fd) +SYSCALL_DEFINE1(close, unsigned int, fd) { struct file * filp; struct files_struct *files = current->files; @@ -1150,14 +1149,13 @@ out_unlock: spin_unlock(&files->file_lock); return -EBADF; } - EXPORT_SYMBOL(sys_close); /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ -asmlinkage long sys_vhangup(void) +SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); diff --git a/kernel/uid16.c b/kernel/uid16.c index 2460c3199b5a..37f48c049a2a 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -17,7 +17,7 @@ #include -asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) +SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ @@ -25,7 +25,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi return ret; } -asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) +SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ @@ -33,7 +33,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g return ret; } -asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) +SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) { long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ -- cgit v1.2.3 From a6b42e83f249aad723589b2bdf6d1dfb2b0997c8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:20 +0100 Subject: [CVE-2009-0029] System call wrappers part 18 Signed-off-by: Heiko Carstens --- kernel/uid16.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/uid16.c b/kernel/uid16.c index 37f48c049a2a..221894e6e980 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -41,7 +41,7 @@ SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) return ret; } -asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) +SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) { long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); /* avoid REGPARM breakage on x86: */ @@ -49,7 +49,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) return ret; } -asmlinkage long sys_setgid16(old_gid_t gid) +SYSCALL_DEFINE1(setgid16, old_gid_t, gid) { long ret = sys_setgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ @@ -57,7 +57,7 @@ asmlinkage long sys_setgid16(old_gid_t gid) return ret; } -asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) +SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) { long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); /* avoid REGPARM breakage on x86: */ @@ -65,7 +65,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) return ret; } -asmlinkage long sys_setuid16(old_uid_t uid) +SYSCALL_DEFINE1(setuid16, old_uid_t, uid) { long ret = sys_setuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ @@ -73,7 +73,7 @@ asmlinkage long sys_setuid16(old_uid_t uid) return ret; } -asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) +SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) { long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); @@ -82,7 +82,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) return ret; } -asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) +SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) { const struct cred *cred = current_cred(); int retval; @@ -94,7 +94,7 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, return retval; } -asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) +SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) { long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); @@ -103,7 +103,8 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) return ret; } -asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) + +SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) { const struct cred *cred = current_cred(); int retval; @@ -115,7 +116,7 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, return retval; } -asmlinkage long sys_setfsuid16(old_uid_t uid) +SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) { long ret = sys_setfsuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ @@ -123,7 +124,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid) return ret; } -asmlinkage long sys_setfsgid16(old_gid_t gid) +SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) { long ret = sys_setfsgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ -- cgit v1.2.3 From 003d7ab479168132a2b2c6700fe682b08f08ab0c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:21 +0100 Subject: [CVE-2009-0029] System call wrappers part 19 Signed-off-by: Heiko Carstens --- fs/read_write.c | 8 ++++---- fs/utimes.c | 5 +++-- kernel/uid16.c | 12 ++++++------ 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/read_write.c b/fs/read_write.c index 0671aa016b6f..fad10af59d95 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(vfs_llseek); -asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin) +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) { off_t retval; struct file * file; @@ -171,9 +171,9 @@ bad: } #ifdef __ARCH_WANT_SYS_LLSEEK -asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, - unsigned long offset_low, loff_t __user * result, - unsigned int origin) +SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, + unsigned long, offset_low, loff_t __user *, result, + unsigned int, origin) { int retval; struct file * file; diff --git a/fs/utimes.c b/fs/utimes.c index 6929e3e91d05..ee853615798a 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -24,7 +24,7 @@ * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ -asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times) +SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) { struct timespec tv[2]; @@ -214,7 +214,8 @@ asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __u return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); } -asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes) +SYSCALL_DEFINE2(utimes, char __user *, filename, + struct timeval __user *, utimes) { return sys_futimesat(AT_FDCWD, filename, utimes); } diff --git a/kernel/uid16.c b/kernel/uid16.c index 221894e6e980..0314501688b9 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -162,7 +162,7 @@ static int groups16_from_user(struct group_info *group_info, return 0; } -asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) +SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist) { const struct cred *cred = current_cred(); int i; @@ -185,7 +185,7 @@ out: return i; } -asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) +SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) { struct group_info *group_info; int retval; @@ -210,22 +210,22 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) return retval; } -asmlinkage long sys_getuid16(void) +SYSCALL_DEFINE0(getuid16) { return high2lowuid(current_uid()); } -asmlinkage long sys_geteuid16(void) +SYSCALL_DEFINE0(geteuid16) { return high2lowuid(current_euid()); } -asmlinkage long sys_getgid16(void) +SYSCALL_DEFINE0(getgid16) { return high2lowgid(current_gid()); } -asmlinkage long sys_getegid16(void) +SYSCALL_DEFINE0(getegid16) { return high2lowgid(current_egid()); } -- cgit v1.2.3 From 5a8a82b1d306a325d899b67715618413657efda4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:25 +0100 Subject: [CVE-2009-0029] System call wrappers part 23 Signed-off-by: Heiko Carstens --- fs/eventpoll.c | 18 +++++++++--------- fs/select.c | 8 ++++---- kernel/sys.c | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 96355d505347..ba2f9ec71192 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1110,7 +1110,7 @@ retry: /* * Open an eventpoll file descriptor. */ -asmlinkage long sys_epoll_create1(int flags) +SYSCALL_DEFINE1(epoll_create1, int, flags) { int error, fd = -1; struct eventpoll *ep; @@ -1150,7 +1150,7 @@ error_return: return fd; } -asmlinkage long sys_epoll_create(int size) +SYSCALL_DEFINE1(epoll_create, int, size) { if (size < 0) return -EINVAL; @@ -1163,8 +1163,8 @@ asmlinkage long sys_epoll_create(int size) * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ -asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, - struct epoll_event __user *event) +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) { int error; struct file *file, *tfile; @@ -1261,8 +1261,8 @@ error_return: * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ -asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, - int maxevents, int timeout) +SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout) { int error; struct file *file; @@ -1319,9 +1319,9 @@ error_return: * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ -asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, - int maxevents, int timeout, const sigset_t __user *sigmask, - size_t sigsetsize) +SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout, const sigset_t __user *, sigmask, + size_t, sigsetsize) { int error; sigset_t ksigmask, sigsaved; diff --git a/fs/select.c b/fs/select.c index d1651648be11..338f703403af 100644 --- a/fs/select.c +++ b/fs/select.c @@ -557,8 +557,8 @@ out_nofds: return ret; } -asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timeval __user *tvp) +SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timeval __user *, tvp) { struct timespec end_time, *to = NULL; struct timeval tv; @@ -854,8 +854,8 @@ static long do_restart_poll(struct restart_block *restart_block) return ret; } -asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, - long timeout_msecs) +SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, + long, timeout_msecs) { struct timespec end_time, *to = NULL; int ret; diff --git a/kernel/sys.c b/kernel/sys.c index 39b192b40034..5292f2119da4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1406,7 +1406,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name) return errno; } -asmlinkage long sys_sethostname(char __user *name, int len) +SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; @@ -1430,7 +1430,7 @@ asmlinkage long sys_sethostname(char __user *name, int len) #ifdef __ARCH_WANT_SYS_GETHOSTNAME -asmlinkage long sys_gethostname(char __user *name, int len) +SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { int i, errno; struct new_utsname *u; @@ -1455,7 +1455,7 @@ asmlinkage long sys_gethostname(char __user *name, int len) * Only setdomainname; getdomainname can be implemented by calling * uname() */ -asmlinkage long sys_setdomainname(char __user *name, int len) +SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) { int errno; char tmp[__NEW_UTS_LEN]; -- cgit v1.2.3 From e48fbb699f82ef1e80bd7126046394d2dc9ca7e6 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:26 +0100 Subject: [CVE-2009-0029] System call wrappers part 24 Signed-off-by: Heiko Carstens --- ipc/msg.c | 12 ++++++------ kernel/sys.c | 13 +++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/ipc/msg.c b/ipc/msg.c index b4eee1c6101d..2ceab7f12fcb 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -309,7 +309,7 @@ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) return security_msg_queue_associate(msq, msgflg); } -asmlinkage long sys_msgget(key_t key, int msgflg) +SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) { struct ipc_namespace *ns; struct ipc_ops msg_ops; @@ -466,7 +466,7 @@ out_up: return err; } -asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) +SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) { struct msg_queue *msq; int err, version; @@ -723,8 +723,8 @@ out_free: return err; } -asmlinkage long -sys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg) +SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, + int, msgflg) { long mtype; @@ -904,8 +904,8 @@ out_unlock: return msgsz; } -asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz, - long msgtyp, int msgflg) +SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, + long, msgtyp, int, msgflg) { long err, mtype; diff --git a/kernel/sys.c b/kernel/sys.c index 5292f2119da4..70ffa8408cd4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1395,7 +1395,7 @@ EXPORT_SYMBOL(in_egroup_p); DECLARE_RWSEM(uts_sem); -asmlinkage long sys_newuname(struct new_utsname __user * name) +SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1478,7 +1478,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) return errno; } -asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) +SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { if (resource >= RLIM_NLIMITS) return -EINVAL; @@ -1497,7 +1497,8 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) * Back compatibility for getrlimit. Needed for some apps. */ -asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim) +SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct rlimit __user *, rlim) { struct rlimit x; if (resource >= RLIM_NLIMITS) @@ -1515,7 +1516,7 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r #endif -asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) { struct rlimit new_rlim, *old_rlim; int retval; @@ -1688,7 +1689,7 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } -asmlinkage long sys_getrusage(int who, struct rusage __user *ru) +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) @@ -1696,7 +1697,7 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru) return getrusage(current, who, ru); } -asmlinkage long sys_umask(int mask) +SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); return mask; -- cgit v1.2.3 From c4ea37c26a691ad0b7e86aa5884aab27830e95c9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:28 +0100 Subject: [CVE-2009-0029] System call wrappers part 26 Signed-off-by: Heiko Carstens --- drivers/pci/syscall.c | 12 ++++-------- ipc/mqueue.c | 22 +++++++++++----------- kernel/sys.c | 4 ++-- mm/swapfile.c | 4 ++-- 4 files changed, 19 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index 645d7a60e412..ec22284eed30 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -14,10 +14,8 @@ #include #include "pci.h" -asmlinkage long -sys_pciconfig_read(unsigned long bus, unsigned long dfn, - unsigned long off, unsigned long len, - void __user *buf) +SYSCALL_DEFINE5(pciconfig_read, unsigned long, bus, unsigned long, dfn, + unsigned long, off, unsigned long, len, void __user *, buf) { struct pci_dev *dev; u8 byte; @@ -86,10 +84,8 @@ error: return err; } -asmlinkage long -sys_pciconfig_write(unsigned long bus, unsigned long dfn, - unsigned long off, unsigned long len, - void __user *buf) +SYSCALL_DEFINE5(pciconfig_write, unsigned long, bus, unsigned long, dfn, + unsigned long, off, unsigned long, len, void __user *, buf) { struct pci_dev *dev; u8 byte; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index faac04c85e74..54b4077fed79 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -814,9 +814,9 @@ static inline void pipelined_receive(struct mqueue_inode_info *info) sender->state = STATE_READY; } -asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, - size_t msg_len, unsigned int msg_prio, - const struct timespec __user *u_abs_timeout) +SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, + size_t, msg_len, unsigned int, msg_prio, + const struct timespec __user *, u_abs_timeout) { struct file *filp; struct inode *inode; @@ -907,9 +907,9 @@ out: return ret; } -asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, - size_t msg_len, unsigned int __user *u_msg_prio, - const struct timespec __user *u_abs_timeout) +SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, + size_t, msg_len, unsigned int __user *, u_msg_prio, + const struct timespec __user *, u_abs_timeout) { long timeout; ssize_t ret; @@ -997,8 +997,8 @@ out: * and he isn't currently owner of notification, will be silently discarded. * It isn't explicitly defined in the POSIX. */ -asmlinkage long sys_mq_notify(mqd_t mqdes, - const struct sigevent __user *u_notification) +SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes, + const struct sigevent __user *, u_notification) { int ret; struct file *filp; @@ -1123,9 +1123,9 @@ out: return ret; } -asmlinkage long sys_mq_getsetattr(mqd_t mqdes, - const struct mq_attr __user *u_mqstat, - struct mq_attr __user *u_omqstat) +SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes, + const struct mq_attr __user *, u_mqstat, + struct mq_attr __user *, u_omqstat) { int ret; struct mq_attr mqstat, omqstat; diff --git a/kernel/sys.c b/kernel/sys.c index 70ffa8408cd4..59aadcdad6ce 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1703,8 +1703,8 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5) +SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; unsigned char comm[sizeof(me->comm)]; diff --git a/mm/swapfile.c b/mm/swapfile.c index da422c47e2ee..f48b831e5e5c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1377,7 +1377,7 @@ out: return ret; } -asmlinkage long sys_swapoff(const char __user * specialfile) +SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct * p = NULL; unsigned short *swap_map; @@ -1633,7 +1633,7 @@ late_initcall(max_swapfiles_check); * * The swapon system call */ -asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct * p; char *name = NULL; -- cgit v1.2.3 From 1e7bfb2134dfec37ce04fb3a4ca89299e892d10c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:29 +0100 Subject: [CVE-2009-0029] System call wrappers part 27 Signed-off-by: Heiko Carstens --- fs/exec.c | 2 +- fs/filesystems.c | 2 +- fs/nfsctl.c | 4 ++-- kernel/printk.c | 2 +- kernel/ptrace.c | 2 +- kernel/sysctl.c | 4 ++-- kernel/timer.c | 2 +- security/keys/keyctl.c | 18 +++++++++--------- 8 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 71a6efe5d8bd..0dd60a01f1b4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -99,7 +99,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt) * * Also note that we take the address to load from from the file itself. */ -asmlinkage long sys_uselib(const char __user * library) +SYSCALL_DEFINE1(uselib, const char __user *, library) { struct file *file; struct nameidata nd; diff --git a/fs/filesystems.c b/fs/filesystems.c index d488dcd7f2bb..1aa70260e6d1 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -179,7 +179,7 @@ static int fs_maxindex(void) /* * Whee.. Weird sysv syscall. */ -asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2) +SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { int retval = -EINVAL; diff --git a/fs/nfsctl.c b/fs/nfsctl.c index b27451909dff..8f9a20556f79 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -86,8 +86,8 @@ static struct { }, }; -long -asmlinkage sys_nfsservctl(int cmd, struct nfsctl_arg __user *arg, void __user *res) +SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg, + void __user *, res) { struct file *file; void __user *p = &arg->u; diff --git a/kernel/printk.c b/kernel/printk.c index e48cf33783fc..69188f226a93 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -382,7 +382,7 @@ out: return error; } -asmlinkage long sys_syslog(int type, char __user *buf, int len) +SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 29dc700e198c..c9cf48b21f05 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -574,7 +574,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) #define arch_ptrace_attach(child) do { } while (0) #endif -asmlinkage long sys_ptrace(long request, long pid, long addr, long data) +SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) { struct task_struct *child; long ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89d74436318c..3e38b74b6124 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1688,7 +1688,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol return error; } -asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) { struct __sysctl_args tmp; int error; @@ -2989,7 +2989,7 @@ int sysctl_ms_jiffies(struct ctl_table *table, #else /* CONFIG_SYSCTL_SYSCALL */ -asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) { struct __sysctl_args tmp; int error; diff --git a/kernel/timer.c b/kernel/timer.c index 14a51530a4cd..13dd64fe143d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1400,7 +1400,7 @@ out: return 0; } -asmlinkage long sys_sysinfo(struct sysinfo __user *info) +SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) { struct sysinfo val; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 09796797d122..070a53eab80f 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -54,11 +54,11 @@ static int key_get_type_from_user(char *type, * - returns the new key's serial number * - implements add_key() */ -asmlinkage long sys_add_key(const char __user *_type, - const char __user *_description, - const void __user *_payload, - size_t plen, - key_serial_t ringid) +SYSCALL_DEFINE5(add_key, const char __user *, _type, + const char __user *, _description, + const void __user *, _payload, + size_t, plen, + key_serial_t, ringid) { key_ref_t keyring_ref, key_ref; char type[32], *description; @@ -146,10 +146,10 @@ asmlinkage long sys_add_key(const char __user *_type, * - if the _callout_info string is empty, it will be rendered as "-" * - implements request_key() */ -asmlinkage long sys_request_key(const char __user *_type, - const char __user *_description, - const char __user *_callout_info, - key_serial_t destringid) +SYSCALL_DEFINE4(request_key, const char __user *, _type, + const char __user *, _description, + const char __user *, _callout_info, + key_serial_t, destringid) { struct key_type *ktype; struct key *key; -- cgit v1.2.3 From 6559eed8ca7db0531a207cd80be5e28cd6f213c5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:32 +0100 Subject: [CVE-2009-0029] System call wrappers part 30 Signed-off-by: Heiko Carstens --- fs/open.c | 13 ++++++------- fs/stat.c | 12 ++++++------ fs/utimes.c | 6 ++++-- kernel/fork.c | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/fs/open.c b/fs/open.c index bc49e3c388d9..a3a78ceb2a2b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -447,7 +447,7 @@ SYSCALL_ALIAS(sys_fallocate, SyS_fallocate); * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. */ -asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) +SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { const struct cred *old_cred; struct cred *override_cred; @@ -628,8 +628,7 @@ out: return err; } -asmlinkage long sys_fchmodat(int dfd, const char __user *filename, - mode_t mode) +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) { struct path path; struct inode *inode; @@ -707,8 +706,8 @@ out: return error; } -asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, - gid_t group, int flag) +SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, + gid_t, group, int, flag) { struct path path; int error = -EINVAL; @@ -1060,8 +1059,8 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) return ret; } -asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, - int mode) +SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, + int, mode) { long ret; diff --git a/fs/stat.c b/fs/stat.c index d712a0dfb50f..2db740a0cfb5 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -260,8 +260,8 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf } #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) -asmlinkage long sys_newfstatat(int dfd, char __user *filename, - struct stat __user *statbuf, int flag) +SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename, + struct stat __user *, statbuf, int, flag) { struct kstat stat; int error = -EINVAL; @@ -293,8 +293,8 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) return error; } -asmlinkage long sys_readlinkat(int dfd, const char __user *pathname, - char __user *buf, int bufsiz) +SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, + char __user *, buf, int, bufsiz) { struct path path; int error; @@ -400,8 +400,8 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf) return error; } -asmlinkage long sys_fstatat64(int dfd, char __user *filename, - struct stat64 __user *statbuf, int flag) +SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, + struct stat64 __user *, statbuf, int, flag) { struct kstat stat; int error = -EINVAL; diff --git a/fs/utimes.c b/fs/utimes.c index ee853615798a..e4c75db5d373 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -170,7 +170,8 @@ out: return error; } -asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __user *utimes, int flags) +SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename, + struct timespec __user *, utimes, int, flags) { struct timespec tstimes[2]; @@ -187,7 +188,8 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __ return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); } -asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes) +SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename, + struct timeval __user *, utimes) { struct timeval times[2]; struct timespec tstimes[2]; diff --git a/kernel/fork.c b/kernel/fork.c index 8eb37d38c6a4..bf0cef8bbdf2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1603,7 +1603,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp * constructed. Here we are modifying the current, active, * task_struct. */ -asmlinkage long sys_unshare(unsigned long unshare_flags) +SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; -- cgit v1.2.3 From 836f92adf121f806e9beb5b6b88bd5c9c4ea3f24 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:33 +0100 Subject: [CVE-2009-0029] System call wrappers part 31 Signed-off-by: Heiko Carstens --- fs/signalfd.c | 8 ++++---- fs/splice.c | 12 ++++++------ fs/timerfd.c | 8 ++++---- kernel/futex.c | 11 +++++------ kernel/sys.c | 4 ++-- 5 files changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/fs/signalfd.c b/fs/signalfd.c index 9c39bc7f8431..b07565c94386 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -205,8 +205,8 @@ static const struct file_operations signalfd_fops = { .read = signalfd_read, }; -asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, - size_t sizemask, int flags) +SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, + size_t, sizemask, int, flags) { sigset_t sigmask; struct signalfd_ctx *ctx; @@ -259,8 +259,8 @@ asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, return ufd; } -asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, - size_t sizemask) +SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, + size_t, sizemask) { return sys_signalfd4(ufd, user_mask, sizemask, 0); } diff --git a/fs/splice.c b/fs/splice.c index a54b3e3f10a7..4ed0ba44a966 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1435,8 +1435,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ -asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, - unsigned long nr_segs, unsigned int flags) +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, + unsigned long, nr_segs, unsigned int, flags) { struct file *file; long error; @@ -1461,9 +1461,9 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, return error; } -asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, - int fd_out, loff_t __user *off_out, - size_t len, unsigned int flags) +SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + size_t, len, unsigned int, flags) { long error; struct file *in, *out; @@ -1685,7 +1685,7 @@ static long do_tee(struct file *in, struct file *out, size_t len, return ret; } -asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { struct file *in; int error, fput_in; diff --git a/fs/timerfd.c b/fs/timerfd.c index 0862f0e49d0c..c8c14f58b96f 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -177,7 +177,7 @@ static struct file *timerfd_fget(int fd) return file; } -asmlinkage long sys_timerfd_create(int clockid, int flags) +SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; struct timerfd_ctx *ctx; @@ -208,9 +208,9 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) return ufd; } -asmlinkage long sys_timerfd_settime(int ufd, int flags, - const struct itimerspec __user *utmr, - struct itimerspec __user *otmr) +SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, + const struct itimerspec __user *, utmr, + struct itimerspec __user *, otmr) { struct file *file; struct timerfd_ctx *ctx; diff --git a/kernel/futex.c b/kernel/futex.c index e86931d8d4e9..f89d373a9c6d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1733,9 +1733,8 @@ pi_faulted: * @head: pointer to the list-head * @len: length of the list-head, as userspace expects */ -asmlinkage long -sys_set_robust_list(struct robust_list_head __user *head, - size_t len) +SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, + size_t, len) { if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -1756,9 +1755,9 @@ sys_set_robust_list(struct robust_list_head __user *head, * @head_ptr: pointer to a list-head pointer, the kernel fills it in * @len_ptr: pointer to a length field, the kernel fills in the header size */ -asmlinkage long -sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, - size_t __user *len_ptr) +SYSCALL_DEFINE3(get_robust_list, int, pid, + struct robust_list_head __user * __user *, head_ptr, + size_t __user *, len_ptr) { struct robust_list_head __user *head; unsigned long ret; diff --git a/kernel/sys.c b/kernel/sys.c index 59aadcdad6ce..e7dc0e10a485 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1817,8 +1817,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return error; } -asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, - struct getcpu_cache __user *unused) +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); -- cgit v1.2.3 From d4e82042c4cfa87a7d51710b71f568fe80132551 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:34 +0100 Subject: [CVE-2009-0029] System call wrappers part 32 Signed-off-by: Heiko Carstens --- fs/eventfd.c | 5 ++--- fs/pipe.c | 2 +- fs/readdir.c | 3 ++- fs/select.c | 11 ++++++----- fs/timerfd.c | 2 +- include/linux/syscalls.h | 7 +++++++ kernel/signal.c | 11 +++++------ 7 files changed, 24 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/fs/eventfd.c b/fs/eventfd.c index 08bf558d0408..5de2c2db3aa2 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -198,7 +198,7 @@ struct file *eventfd_fget(int fd) return file; } -asmlinkage long sys_eventfd2(unsigned int count, int flags) +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { int fd; struct eventfd_ctx *ctx; @@ -228,8 +228,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags) return fd; } -asmlinkage long sys_eventfd(unsigned int count) +SYSCALL_DEFINE1(eventfd, unsigned int, count) { return sys_eventfd2(count, 0); } - diff --git a/fs/pipe.c b/fs/pipe.c index 0c64db86c919..b89c878588a9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1043,7 +1043,7 @@ int do_pipe(int *fd) * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ -asmlinkage long sys_pipe2(int __user *fildes, int flags) +SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) { int fd[2]; int error; diff --git a/fs/readdir.c b/fs/readdir.c index cf6a0e39819a..7723401f8d8b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -102,7 +102,8 @@ efault: return -EFAULT; } -asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count) +SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; struct file * file; diff --git a/fs/select.c b/fs/select.c index 338f703403af..0fe0e1469df3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -636,8 +636,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ -asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec __user *tsp, void __user *sig) +SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timespec __user *, tsp, + void __user *, sig) { size_t sigsetsize = 0; sigset_t __user *up = NULL; @@ -889,9 +890,9 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, } #ifdef HAVE_SET_RESTORE_SIGMASK -asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec __user *tsp, const sigset_t __user *sigmask, - size_t sigsetsize) +SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, + struct timespec __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) { sigset_t ksigmask, sigsaved; struct timespec ts, end_time, *to = NULL; diff --git a/fs/timerfd.c b/fs/timerfd.c index c8c14f58b96f..6a123b8ff3f5 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -265,7 +265,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, return 0; } -asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr) +SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) { struct file *file; struct timerfd_ctx *ctx; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 90aa5eba87a2..56c400138b05 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -678,6 +678,13 @@ asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); +asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, + fd_set __user *, struct timespec __user *, + void __user *); +asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, + struct timespec __user *, const sigset_t __user *, + size_t); +asmlinkage long sys_pipe2(int __user *, int); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); diff --git a/kernel/signal.c b/kernel/signal.c index e2333929611a..e73759783dc8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2491,11 +2491,10 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION -asmlinkage long -sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize) +SYSCALL_DEFINE4(rt_sigaction, int, sig, + const struct sigaction __user *, act, + struct sigaction __user *, oact, + size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret = -EINVAL; @@ -2578,7 +2577,7 @@ SYSCALL_DEFINE0(pause) #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND -asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) +SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; -- cgit v1.2.3 From 9316fcacb89c59fe556c48587ac02cd7f5d38045 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 Jan 2009 09:35:44 -0800 Subject: kernel/up.c: omit it if SMP=y, USE_GENERIC_SMP_HELPERS=n Fix the sparc build - we were including `up.o' on SMP builds, when CONFIG_USE_GENERIC_SMP_HELPERS=n. Tested-by: Robert Reif Fixed-by: Robert Reif Cc: David Miller Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2aebc4cd7878..170a9213c1b6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,9 +40,8 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y) -obj-y += smp.o -else +obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_SMP) += spinlock.o -- cgit v1.2.3 From 98a4826b99bc4bcc34c604b2fc4fcf4d771600ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2009 10:56:32 +0100 Subject: sched: fix bandwidth validation for UID grouping Impact: make rt-limit tunables work again Mark Glines reported: > I've got an issue on x86-64 where I can't configure the system to allow > RT tasks for a non-root user. > > In 2.6.26.5, I was able to do the following to set things up nicely: > echo 450000 >/sys/kernel/uids/0/cpu_rt_runtime > echo 450000 >/sys/kernel/uids/1000/cpu_rt_runtime > > Seems like every value I try to echo into the /sys files returns EINVAL. For UID grouping we initialize the root group with infinite bandwidth which by default is actually more than the global limit, therefore the bandwidth check always fails. Because the root group is a phantom group (for UID grouping) we cannot runtime adjust it, therefore we let it reflect the global bandwidth settings. Reported-by: Mark Glines Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3b630d882660..ed62d1cee05c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9050,6 +9050,13 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } +#ifdef CONFIG_USER_SCHED + if (tg == &root_task_group) { + period = global_rt_period(); + runtime = global_rt_runtime(); + } +#endif + /* * Cannot have more runtime than the period. */ -- cgit v1.2.3 From cce7ade803699463ecc62a065ca522004f7ccb3d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Jan 2009 14:53:37 +0100 Subject: sched: SCHED_IDLE weight change Increase the SCHED_IDLE weight from 2 to 3, this gives much more stable vruntime numbers. time advanced in 100ms: weight=2 64765.988352 67012.881408 88501.412352 weight=3 35496.181411 34130.971298 35497.411573 Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ed62d1cee05c..6acfb3c2398b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1323,8 +1323,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) * slice expiry etc. */ -#define WEIGHT_IDLEPRIO 2 -#define WMULT_IDLEPRIO (1 << 31) +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 /* * Nice levels are multiplicative, with a gentle 10% change for every -- cgit v1.2.3 From 6bc912b71b6f33b041cfde93ca3f019cbaa852bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Jan 2009 14:53:38 +0100 Subject: sched: SCHED_OTHER vs SCHED_IDLE isolation Stronger SCHED_IDLE isolation: - no SCHED_IDLE buddies - never let SCHED_IDLE preempt on wakeup - always preempt SCHED_IDLE on wakeup - limit SLEEPER fairness for SCHED_IDLE. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8e1352c75557..cdebd8089cb0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -677,9 +677,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) unsigned long thresh = sysctl_sched_latency; /* - * convert the sleeper threshold into virtual time + * Convert the sleeper threshold into virtual time. + * SCHED_IDLE is a special sub-class. We care about + * fairness only relative to other SCHED_IDLE tasks, + * all of which have the same weight. */ - if (sched_feat(NORMALIZED_SLEEPER)) + if (sched_feat(NORMALIZED_SLEEPER) && + task_of(se)->policy != SCHED_IDLE) thresh = calc_delta_fair(thresh, se); vruntime -= thresh; @@ -1340,14 +1344,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - for_each_sched_entity(se) - cfs_rq_of(se)->last = se; + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; + } } static void set_next_buddy(struct sched_entity *se) { - for_each_sched_entity(se) - cfs_rq_of(se)->next = se; + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; + } } /* @@ -1393,12 +1401,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; /* - * Batch tasks do not preempt (their preemption is driven by + * Batch and idle tasks do not preempt (their preemption is driven by * the tick): */ - if (unlikely(p->policy == SCHED_BATCH)) + if (unlikely(p->policy != SCHED_NORMAL)) return; + /* Idle tasks are by definition preempted by everybody. */ + if (unlikely(curr->policy == SCHED_IDLE)) { + resched_task(curr); + return; + } + if (!sched_feat(WAKEUP_PREEMPT)) return; -- cgit v1.2.3 From e17036dac189dd034c092a91df56aa740db7146d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Jan 2009 14:53:39 +0100 Subject: sched: fix update_min_vruntime Impact: fix SCHED_IDLE latency problems OK, so we have 1 running task A (which is obviously curr and the tree is equally obviously empty). 'A' nicely chugs along, doing its thing, carrying min_vruntime along as it goes. Then some whacko speed freak SCHED_IDLE task gets inserted due to SMP balancing, which is very likely far right, in that case update_curr update_min_vruntime cfs_rq->rb_leftmost := true (the crazy task sitting in a tree) vruntime = se->vruntime and voila, min_vruntime is waaay right of where it ought to be. OK, so why did I write it like that to begin with... Aah, yes. Say we've just dequeued current schedule deactivate_task(prev) dequeue_entity update_min_vruntime Then we'll set vruntime = cfs_rq->min_vruntime; we find !cfs_rq->curr, but do find someone in the tree. Then we _must_ do vruntime = se->vruntime, because vruntime = min_vruntime(vruntime := cfs_rq->min_vruntime, se->vruntime) will not advance vruntime, and cause lags the other way around (which we fixed with that initial patch: 1af5f730fc1bf7c62ec9fb2d307206e18bf40a69 (sched: more accurate min_vruntime accounting). Signed-off-by: Peter Zijlstra Tested-by: Mike Galbraith Acked-by: Mike Galbraith Cc: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cdebd8089cb0..16b419bb8b0a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -283,7 +283,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) struct sched_entity, run_node); - if (vruntime == cfs_rq->min_vruntime) + if (!cfs_rq->curr) vruntime = se->vruntime; else vruntime = min_vruntime(vruntime, se->vruntime); -- cgit v1.2.3 From 88fc241f54459ac3d86c5e13b449730199f66061 Mon Sep 17 00:00:00 2001 From: Doug Chapman Date: Thu, 15 Jan 2009 10:38:56 -0800 Subject: [IA64] dump stack on kernel unaligned warnings Often the cause of kernel unaligned access warnings is not obvious from just the ip displayed in the warning. This adds the option via proc to dump the stack in addition to the warning. The default is off (just display the 1 line warning). To enable the stack to be shown: echo 1 > /proc/sys/kernel/unaligned-dump-stack Signed-off-by: Doug Chapman Signed-off-by: Tony Luck --- arch/ia64/kernel/unaligned.c | 6 +++++- kernel/sysctl.c | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index ff0e7c10faa7..6db08599ebbc 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -59,6 +59,7 @@ dump (const char *str, void *vp, size_t len) * (i.e. don't allow attacker to fill up logs with unaligned accesses). */ int no_unaligned_warning; +int unaligned_dump_stack; static int noprint_warning; /* @@ -1371,9 +1372,12 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) } } } else { - if (within_logging_rate_limit()) + if (within_logging_rate_limit()) { printk(KERN_WARNING "kernel unaligned access to 0x%016lx, ip=0x%016lx\n", ifa, regs->cr_iip + ipsr->ri); + if (unaligned_dump_stack) + dump_stack(); + } set_fs(KERNEL_DS); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e38b74b6124..368d1638ee78 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -144,6 +144,7 @@ extern int acct_parm[]; #ifdef CONFIG_IA64 extern int no_unaligned_warning; +extern int unaligned_dump_stack; #endif #ifdef CONFIG_RT_MUTEXES @@ -781,6 +782,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "unaligned-dump-stack", + .data = &unaligned_dump_stack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif #ifdef CONFIG_DETECT_SOFTLOCKUP { -- cgit v1.2.3 From 6272d68cc6a5f90c6b1a2228cf0f67b895305d17 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Thu, 15 Jan 2009 17:17:15 +0100 Subject: sched: sched_slice() fixlet Mike's change: 0a582440f "sched: fix sched_slice())" broke group scheduling by forgetting to reload cfs_rq on each loop. This patch fixes aim7 regression and specjbb2005 regression becomes less than 1.5% on 8-core stokley. Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra Tested-by: Jayson King Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 16b419bb8b0a..5cc1c162044f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -429,7 +429,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); for_each_sched_entity(se) { - struct load_weight *load = &cfs_rq->load; + struct load_weight *load; + + cfs_rq = cfs_rq_of(se); + load = &cfs_rq->load; if (unlikely(!se->on_rq)) { struct load_weight lw = cfs_rq->load; -- cgit v1.2.3 From 45ce80fb6b6f9594d1396d44dd7e7c02d596fef8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 13:50:59 -0800 Subject: cgroups: consolidate cgroup documents Move Documentation/cpusets.txt and Documentation/controllers/* to Documentation/cgroups/ Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 5 +- Documentation/cgroups/cpuacct.txt | 32 + Documentation/cgroups/cpusets.txt | 808 +++++++++++++++++++++++++ Documentation/cgroups/devices.txt | 52 ++ Documentation/cgroups/memcg_test.txt | 342 +++++++++++ Documentation/cgroups/memory.txt | 399 ++++++++++++ Documentation/cgroups/resource_counter.txt | 181 ++++++ Documentation/controllers/cpuacct.txt | 32 - Documentation/controllers/devices.txt | 52 -- Documentation/controllers/memcg_test.txt | 342 ----------- Documentation/controllers/memory.txt | 399 ------------ Documentation/controllers/resource_counter.txt | 181 ------ Documentation/cpusets.txt | 808 ------------------------- Documentation/scheduler/sched-design-CFS.txt | 2 +- include/linux/res_counter.h | 2 +- init/Kconfig | 9 +- kernel/cpuset.c | 2 +- 17 files changed, 1824 insertions(+), 1824 deletions(-) create mode 100644 Documentation/cgroups/cpuacct.txt create mode 100644 Documentation/cgroups/cpusets.txt create mode 100644 Documentation/cgroups/devices.txt create mode 100644 Documentation/cgroups/memcg_test.txt create mode 100644 Documentation/cgroups/memory.txt create mode 100644 Documentation/cgroups/resource_counter.txt delete mode 100644 Documentation/controllers/cpuacct.txt delete mode 100644 Documentation/controllers/devices.txt delete mode 100644 Documentation/controllers/memcg_test.txt delete mode 100644 Documentation/controllers/memory.txt delete mode 100644 Documentation/controllers/resource_counter.txt delete mode 100644 Documentation/cpusets.txt (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index e33ee74eee77..d9e5d6f41b92 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -1,7 +1,8 @@ CGROUPS ------- -Written by Paul Menage based on Documentation/cpusets.txt +Written by Paul Menage based on +Documentation/cgroups/cpusets.txt Original copyright statements from cpusets.txt: Portions Copyright (C) 2004 BULL SA. @@ -68,7 +69,7 @@ On their own, the only use for cgroups is for simple job tracking. The intention is that other subsystems hook into the generic cgroup support to provide new attributes for cgroups, such as accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cpusets.txt) allows +access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allows you to associate a set of CPUs and a set of memory nodes with the tasks in each cgroup. diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt new file mode 100644 index 000000000000..bb775fbe43d7 --- /dev/null +++ b/Documentation/cgroups/cpuacct.txt @@ -0,0 +1,32 @@ +CPU Accounting Controller +------------------------- + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem. + +# mkdir /cgroups +# mount -t cgroup -ocpuacct none /cgroups + +With the above step, the initial or the parent accounting group +becomes visible at /cgroups. At bootup, this group includes all the +tasks in the system. /cgroups/tasks lists the tasks in this cgroup. +/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by +this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /cgroups. + +# cd /cgroups +# mkdir g1 +# echo $$ > g1 + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/cgroups/cpuacct.usage also. diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt new file mode 100644 index 000000000000..5c86c258c791 --- /dev/null +++ b/Documentation/cgroups/cpusets.txt @@ -0,0 +1,808 @@ + CPUSETS + ------- + +Copyright (C) 2004 BULL SA. +Written by Simon.Derr@bull.net + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson +Modified by Christoph Lameter +Modified by Paul Menage +Modified by Hidetoshi Seto + +CONTENTS: +========= + +1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes +3. Questions +4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a tasks current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/cgroups/cgroups.txt. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that tasks cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting tasks mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. + +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. + +The kernel cpuset patch provides the minimum essential kernel +mechanisms required to efficiently implement such subsets. It +leverages existing CPU and Memory Placement facilities in the Linux +kernel to avoid any additional impact on the critical scheduler or +memory allocator code. + + +1.3 How are cpusets implemented ? +--------------------------------- + +Cpusets provide a Linux kernel mechanism to constrain which CPUs and +Memory Nodes are used by a process or set of processes. + +The Linux kernel already has a pair of mechanisms to specify on which +CPUs a task may be scheduled (sched_setaffinity) and on which Memory +Nodes it may obtain memory (mbind, set_mempolicy). + +Cpusets extends these two mechanisms as follows: + + - Cpusets are sets of allowed CPUs and Memory Nodes, known to the + kernel. + - Each task in the system is attached to a cpuset, via a pointer + in the task structure to a reference counted cgroup structure. + - Calls to sched_setaffinity are filtered to just those CPUs + allowed in that tasks cpuset. + - Calls to mbind and set_mempolicy are filtered to just + those Memory Nodes allowed in that tasks cpuset. + - The root cpuset contains all the systems CPUs and Memory + Nodes. + - For any cpuset, one can define child cpusets containing a subset + of the parents CPU and Memory Node resources. + - The hierarchy of cpusets can be mounted at /dev/cpuset, for + browsing and manipulation from user space. + - A cpuset may be marked exclusive, which ensures that no other + cpuset (except direct ancestors and descendents) may contain + any overlapping CPUs or Memory Nodes. + - You can list all the tasks (by pid) attached to any cpuset. + +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that tasks cpuset. + - in sched.c migrate_all_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that tasks cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. + +The /proc//status file for each task has four added lines, +displaying the tasks cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpus: list of CPUs in that cpuset + - mems: list of Memory Nodes in that cpuset + - memory_migrate flag: if set, move pages to cpusets nodes + - cpu_exclusive flag: is cpu placement exclusive? + - mem_exclusive flag: is memory placement exclusive? + - mem_hardwall flag: is memory allocation hardwalled + - memory_pressure: measure of how much paging pressure in cpuset + +In addition, the root cpuset only has the following file: + - memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpusets directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parents. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. + +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them change to ensure nothing overlaps a +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_map using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendent, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'memory_spread_page' and +'memory_spread_slab'. + +If the per-cpuset boolean flag file 'memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. + +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the tasks NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the tasks NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing tasks memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'memory_spread_page' turns on a per-process flag +PF_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PF_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'memory_spread_slab' turns on the flag +PF_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current tasks mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. + +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, except those +marked isolated using the kernel boot time "isolcpus=" argument. + +This default load balancing across all CPUs is not well suited for +the following two situations: + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpusets allowed 'cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwised pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendent cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside it cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all +the CPUs that must be load balanced. + +Whenever the 'sched_load_balance' flag changes, or CPUs come or go +from a cpuset with this flag enabled, or a cpuset with this flag +enabled is removed, the cpuset code builds a new such partition and +passes it to the scheduler sched domain setup code, to have the sched +domains rebuilt as necessary. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (cpumask_t) in the partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +In sched domain, the scheduler migrates tasks in 2 ways; periodic load +balance on tick, and at time of some schedule events. + +When a task is woken up, scheduler try to move the task on idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and performing idle, +then scheduler migrate task B to CPU Y so that task B can start on +CPU Y without waiting task A on CPU X. + +And if a CPU run out of tasks in its runqueue, the CPU try to pull +extra tasks from other busy CPUs to help them before it is going to +be idle. + +Of course it takes some searching cost to find movable tasks and/or +idle CPUs, the scheduler might not search all CPUs in the domain +everytime. In fact, in some architectures, the searching ranges on +events are limited in the same socket or node where the CPU locates, +while the load balance on tick searchs all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As the result, task B on CPU X need to wait task A or wait load balance +on the next tick. For some applications in special situation, waiting +1 tick may be too long. + +The 'sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes int value which +indicates size of searching range in levels ideally as follows, +otherwise initial value -1 that indicates the cpuset has no request. + + -1 : no request. use system default or follow request of others. + 0 : no search. + 1 : search siblings (hyperthreads in a core). + 2 : search cores in a package. + 3 : search cpus in a node [= system wide on non-NUMA system] + ( 4 : search nodes in a chunk of node [on NUMA system] ) + ( 5 : search system wide [on NUMA system] ) + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affect the sched domain where the cpuset +belongs to. Therefore if the flag 'sched_load_balance' of a cpuset +is disabled, then 'sched_relax_domain_level' have no effect since +there is no sched domain belonging the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used. Be careful, if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not will be depend on your situation. +Don't modify this file if you are not sure. + +If your situation is: + - The migration costs between each cpu can be assumed considerably + small(for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have impact(for you) or you can make + the searching cost enough small by managing cpuset to compact etc. + - The latency is required even it sacrifices cache hit rate etc. +then increasing 'sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ? +-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the tasks cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its numa placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the tasks +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a tasks pid is written to a cpusets 'tasks' file, in either its +current cpuset or another cpuset, then its allowed CPU placement is +changed immediately. If such a task had been bound to some subset +of its cpuset using the sched_setaffinity() call, the task will be +allowed to run on any CPU allowed in its new cpuset, negating the +affect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +but the processor placement is not updated, until that tasks pid is +rewritten to the 'tasks' file of its cpuset. This is done to avoid +impacting the scheduler code in the kernel with a check for changes +in a tasks processor placement. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'mems' subsequently changes. +If the cpuset flag file 'memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the tasks new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'memory_migrate' is set true, then if that cpusets +'mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'mems', +will be moved to nodes in the new setting of 'mems.' +Pages that were not in the tasks prior cpuset, or in the cpusets +prior 'mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current tasks cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /dev/cpuset + 2) mount -t cgroup -ocpuset cpuset /dev/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /dev/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /dev/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset: + + mount -t cgroup -ocpuset cpuset /dev/cpuset + cd /dev/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpus + /bin/echo 1 > mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +In the future, a C library interface to cpusets will likely be +available. For now, the only way to query or modify cpusets is +via the cpuset file system, using the various cd, mkdir, echo, cat, +rmdir commands from the shell, or their equivalent from C. + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /dev/cpuset + +Then under /dev/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /dev/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /dev/cpuset: +# cd /dev/cpuset +# mkdir my_cpuset + +Now you want to do something with this cpuset. +# cd my_cpuset + +In this directory you can find several files: +# ls +cpu_exclusive memory_migrate mems tasks +cpus memory_pressure notify_on_release +mem_exclusive memory_spread_page sched_load_balance +mem_hardwall memory_spread_slab sched_relax_domain_level + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags: +# /bin/echo 1 > cpu_exclusive + +Add some cpus: +# /bin/echo 0-7 > cpus + +Add some mems: +# /bin/echo 0-7 > mems + +Now attach your shell to this cpuset: +# /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cpuset, just use rmdir: +# rmdir my_sub_cs +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command + +mount -t cpuset X /dev/cpuset + +is equivalent to + +mount -t cgroup -ocpuset X /dev/cpuset +echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories: + +# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 +# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 + +2.3 Setting flags +----------------- + +The syntax is very simple: + +# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' +# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' + +2.4 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! +A: We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt new file mode 100644 index 000000000000..7cc6e6a60672 --- /dev/null +++ b/Documentation/cgroups/devices.txt @@ -0,0 +1,52 @@ +Device Whitelist Controller + +1. Description: + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. However +when a device access is removed from a parent it will not also be +removed from the child(ren). + +2. User Interface + +An entry is added using devices.allow, and removed using +devices.deny. For instance + + echo 'c 1:3 mr' > /cgroups/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing + + echo a > /cgroups/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing + + echo a > /cgroups/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendent of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt new file mode 100644 index 000000000000..19533f93b7a2 --- /dev/null +++ b/Documentation/cgroups/memcg_test.txt @@ -0,0 +1,342 @@ +Memory Resource Controller(Memcg) Implementation Memo. +Last Updated: 2008/12/15 +Base Kernel Version: based on 2.6.28-rc8-mm. + +Because VM is getting complex (one of reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/cgroups/memory.txt) + +0. How to record usage ? + 2 objects are used. + + page_cgroup ....an object per page. + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_newpage_charge() + Called at new page fault and Copy-On-Write. + + mem_cgroup_try_charge_swapin() + Called at do_swap_page() (page fault on swap entry) and swapoff. + Followed by charge-commit-cancel protocol. (With swap accounting) + At commit, a charge recorded in swap_cgroup is removed. + + mem_cgroup_cache_charge() + Called at add_to_page_cache() + + mem_cgroup_cache_charge_swapin() + Called at shmem's swapin. + + mem_cgroup_prepare_migration() + Called before migration. "extra" charge is done and followed by + charge-commit-cancel protocol. + At commit, charge against oldpage or newpage will be committed. + +2. Uncharge + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge_page() + Called when an anonymous page is fully unmapped. I.e., mapcount goes + to 0. If the page is SwapCache, uncharge is delayed until + mem_cgroup_uncharge_swapcache(). + + mem_cgroup_uncharge_cache_page() + Called when a page-cache is deleted from radix-tree. If the page is + SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache(). + + mem_cgroup_uncharge_swapcache() + Called when SwapCache is removed from radix-tree. The charge itself + is moved to swap_cgroup. (If mem+swap controller is disabled, no + charge to swap occurs.) + + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + + mem_cgroup_end_migration(old, new) + At success of migration old is uncharged (if necessary), a charge + to new page is committed. At failure, charge to old page is committed. + +3. charge-commit-cancel + In some case, we can't know this "charge" is valid or not at charging + (because of races). + To handle such case, there are charge-commit-cancel functions. + mem_cgroup_try_charge_XXX + mem_cgroup_commit_charge_XXX + mem_cgroup_cancel_charge_XXX + these are used in swap-in and migration. + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. + + At commit(), the function checks the page should be charged or not + and set flags or avoid charging.(usage -= PAGE_SIZE) + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + It is charged right after it's allocated before doing any page table + related operations. Of course, it's uncharged when another page is used + for the fault address. + + At freeing anonymous page (by exit() or munmap()), zap_pte() is called + and pages for ptes are freed one by one.(see mm/memory.c). Uncharges + are done at page_remove_rmap() when page_mapcount() goes down to 0. + + Another page freeing is by page-reclaim (vmscan.c) and anonymous + pages are swapped out. In this case, the page is marked as + PageSwapCache(). uncharge() routine doesn't uncharge the page marked + as SwapCache(). It's delayed until __delete_from_swap_cache(). + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + This swap-in is one of the most complicated work. In do_swap_page(), + following events occur when pte is unchanged. + + (1) the page (SwapCache) is looked up. + (2) lock_page() + (3) try_charge_swapin() + (4) reuse_swap_page() (may call delete_swap_cache()) + (5) commit_charge_swapin() + (6) swap_free(). + + Considering following situation for example. + + (A) The page has not been charged before (2) and reuse_swap_page() + doesn't call delete_from_swap_cache(). + (B) The page has not been charged before (2) and reuse_swap_page() + calls delete_from_swap_cache(). + (C) The page has been charged before (2) and reuse_swap_page() doesn't + call delete_from_swap_cache(). + (D) The page has been charged before (2) and reuse_swap_page() calls + delete_from_swap_cache(). + + memory.usage/memsw.usage changes to this page/swp_entry will be + Case (A) (B) (C) (D) + Event + Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 + =========================================== + (3) +1/+1 +1/+1 +1/+1 +1/+1 + (4) - 0/ 0 - -1/ 0 + (5) 0/-1 0/ 0 -1/-1 0/ 0 + (6) - 0/-1 - 0/-1 + =========================================== + Result 1/ 1 1/ 1 1/ 1 1/ 1 + + In any cases, charges to this page should be 1/ 1. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + At (b), the page is marked as SwapCache and not uncharged. + At (d), the page is removed from SwapCache and a charge in page_cgroup + is moved to swap_cgroup. + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + Here, a charge in swap_cgroup disappears. + +5. Page Cache + Page Cache is charged at + - add_to_page_cache_locked(). + + uncharged at + - __remove_from_page_cache(). + + The logic is very clear. (About migration, see below) + Note: __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache + Memcg's charge/uncharge have special handlers of shmem. The best way + to understand shmem's page state transition is to read mm/shmem.c. + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... + - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + It's uncharged when + - A page is removed from radix-tree and not SwapCache. + - When SwapCache is removed, a charge is moved to swap_cgroup. + - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup + disappears. + +7. Page Migration + One of the most complicated functions is page-migration-handler. + Memcg has 2 routines. Assume that we are migrating a page's contents + from OLDPAGE to NEWPAGE. + + Usual migration logic is.. + (a) remove the page from LRU. + (b) allocate NEWPAGE (migration target) + (c) lock by lock_page(). + (d) unmap all mappings. + (e-1) If necessary, replace entry in radix-tree. + (e-2) move contents of a page. + (f) map all mappings again. + (g) pushback the page to LRU. + (-) OLDPAGE will be freed. + + Before (g), memcg should complete all necessary charge/uncharge to + NEWPAGE/OLDPAGE. + + The point is.... + - If OLDPAGE is anonymous, all charges will be dropped at (d) because + try_to_unmap() drops all mapcount and the page will not be + SwapCache. + + - If OLDPAGE is SwapCache, charges will be kept at (g) because + __delete_from_swap_cache() isn't called at (e-1) + + - If OLDPAGE is page-cache, charges will be kept at (g) because + remove_from_swap_cache() isn't called at (e-1) + + memcg provides following hooks. + + - mem_cgroup_prepare_migration(OLDPAGE) + Called after (b) to account a charge (usage += PAGE_SIZE) against + memcg which OLDPAGE belongs to. + + - mem_cgroup_end_migration(OLDPAGE, NEWPAGE) + Called after (f) before (g). + If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already + charged, a charge by prepare_migration() is automatically canceled. + If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE. + + But zap_pte() (by exit or munmap) can be called while migration, + we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). + +8. LRU + Each memcg has its own private LRU. Now, it's handling is under global + VM's control (means that it's handled under global zone->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under zone->lru_lock(). + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. + + Tests for racy cases. + + 9.1 Small limit to memcg. + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + + 9.2 Shmem + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + + 9.3 Migration + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration. + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. Following is a script to migrate all + under cpuset. + -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + 9.4 Memory hotplug. + memory hotplug test is one of good test. + to offline memory, do following. + # echo offline > /sys/devices/system/memory/memoryXXX/state + (XXX is the place of memory) + This is an easy way to test page migration, too. + + 9.5 mkdir/rmdir + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following. + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running. + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + + 9.6 Mount with other subsystems. + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example) + # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt new file mode 100644 index 000000000000..e1501964df1e --- /dev/null +++ b/Documentation/cgroups/memory.txt @@ -0,0 +1,399 @@ +Memory Resource Controller + +NOTE: The Memory Resource Controller has been generically been referred +to as the memory controller in this document. Do not confuse memory controller +used here with the memory controller that is used in hardware. + +Salient features + +a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages +b. The infrastructure allows easy addition of other types of memory to control +c. Provides *zero overhead* for non memory controller users +d. Provides a double LRU: global memory pressure causes reclaim from the + global LRU; a cgroup on hitting a limit, reclaims from the per + cgroup LRU + +NOTE: Swap Cache (unmapped) is not accounted now. + +Benefits and Purpose of the memory controller + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. Isolate an application or a group of applications + Memory hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with limited amount of memory, this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases, find one or use the controller just + for fun (to learn and hack on the VM subsystem). + +1. History + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design + +The core of the design is a counter called the res_counter. The res_counter +tracks the current memory usage and limit of the group of processes associated +with the controller. Each cgroup has a memory controller specific data +structure (mem_cgroup) associated with it. + +2.2. Accounting + + +--------------------+ + | mem_cgroup | + | (res_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge() is invoked to setup +the necessary data structures and check if the cgroup that is being charged +is over its limit. If it is then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +allocated and associated with the page. This routine also adds the page to +the per cgroup LRU. + +2.2.1 Accounting details + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +(some pages which never be reclaimable and will not be on global LRU + are not accounted. we just accounts pages under usual vm management.) + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +A RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. + +At page migration, accounting information is kept. + +Note: we just account pages-on-lru because our purpose is to control amount +of used pages. not-on-lru pages are tend to be out-of-control from vm view. + +2.3 Shared Page Accounting + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + + +2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +usage of mem+swap is limited by memsw.limit_in_bytes. + +Note: why 'mem+swap' rather than swap. +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +mem+swap. + +In other words, when we want to limit the usage of swap without affecting +global LRU, mem+swap limit is better than just limiting swap from OS point +of view. + +2.5 Reclaim + +Each cgroup maintains a per cgroup LRU that consists of an active +and inactive list. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per cgroup LRU +list. + +2. Locking + +The memory controller uses the following hierarchy + +1. zone->lru_lock is used for selecting pages to be isolated +2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) +3. lock_page_cgroup() is used to protect page->page_cgroup + +3. User Interface + +0. Configuration + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_RESOURCE_COUNTERS +c. Enable CONFIG_CGROUP_MEM_RES_CTLR + +1. Prepare the cgroups +# mkdir -p /cgroups +# mount -t cgroup none /cgroups -o memory + +2. Make the new group and move bash into it +# mkdir /cgroups/0 +# echo $$ > /cgroups/0/tasks + +Since now we're in the 0 cgroup, +We can alter the memory limit: +# echo 4M > /cgroups/0/memory.limit_in_bytes + +NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, +mega or gigabytes. + +# cat /cgroups/0/memory.limit_in_bytes +4194304 + +NOTE: The interface has now changed to display the usage in bytes +instead of pages + +We can check the usage: +# cat /cgroups/0/memory.usage_in_bytes +1216512 + +A successful write to this file does not guarantee a successful set of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. The user is required to re-read +this file after a write to guarantee the value committed by the kernel. + +# echo 1 > memory.limit_in_bytes +# cat memory.limit_in_bytes +4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing + +Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. +Apart from that v6 has been tested with several applications and regular +daily use. The controller has also been tested on the PPC64, x86_64 and +UML platforms. + +4.1 Troubleshooting + +Sometimes a user might find that the application under a cgroup is +terminated. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +4.2 Task migration + +When a task migrates from one cgroup to another, it's charge is not +carried forward. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +4.3 Removing a cgroup + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. +Such charges are freed(at default) or moved to its parent. When moved, +both of RSS and CACHES are moved to parent. +If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. + +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + + +5. Misc. interfaces. + +5.1 force_empty + memory.force_empty interface is provided to make cgroup's memory usage empty. + You can use this interface only when the cgroup has no tasks. + When writing anything to this + + # echo 0 > memory.force_empty + + Almost all pages tracked by this memcg will be unmapped and freed. Some of + pages cannot be freed because it's locked or in-use. Such pages are moved + to parent and this cgroup will be empty. But this may return -EBUSY in + some too busy case. + + Typical use case of this interface is that calling this before rmdir(). + Because rmdir() moves all pages to parent, some out-of-use page caches can be + moved to the parent. If you want to avoid that, force_empty will be useful. + +5.2 stat file + memory.stat file includes following statistics (now) + cache - # of pages from page-cache and shmem. + rss - # of pages from anonymous memory. + pgpgin - # of event of charging + pgpgout - # of event of uncharging + active_anon - # of pages on active lru of anon, shmem. + inactive_anon - # of pages on active lru of anon, shmem + active_file - # of pages on active lru of file-cache + inactive_file - # of pages on inactive lru of file cache + unevictable - # of pages cannot be reclaimed.(mlocked etc) + + Below is depend on CONFIG_DEBUG_VM. + inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) + recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) + recent_rotated_file - VM internal parameter. (see mm/vmscan.c) + recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) + recent_scanned_file - VM internal parameter. (see mm/vmscan.c) + + Memo: + recent_rotated means recent frequency of lru rotation. + recent_scanned means recent # of scans to lru. + showing for better debug please see the code for meanings. + + +5.3 swappiness + Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. + + Following cgroup's swapiness can't be changed. + - root cgroup (uses /proc/sys/vm/swappiness). + - a cgroup which uses hierarchy and it has child cgroup. + - a cgroup which uses hierarchy and not the root of hierarchy. + + +6. Hierarchy support + +The memory controller supports a deep hierarchy and hierarchical accounting. +The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled. If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim + +The memory controller by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup + +# echo 1 > memory.use_hierarchy + +The feature can be disabled by + +# echo 0 > memory.use_hierarchy + +NOTE1: Enabling/disabling will fail if the cgroup already has other +cgroups created below it. + +NOTE2: This feature can be enabled/disabled per subtree. + +7. TODO + +1. Add support for accounting huge pages (as a separate controller) +2. Make per-cgroup scanner reclaim not-shared pages first +3. Teach controller to account for shared-pages +4. Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt new file mode 100644 index 000000000000..f196ac1d7d25 --- /dev/null +++ b/Documentation/cgroups/resource_counter.txt @@ -0,0 +1,181 @@ + + The Resource Counter + +The resource counter, declared at include/linux/res_counter.h, +is supposed to facilitate the resource management by controllers +by providing common stuff for accounting. + +This "stuff" includes the res_counter structure and routines +to work with it. + + + +1. Crucial parts of the res_counter structure + + a. unsigned long long usage + + The usage value shows the amount of a resource that is consumed + by a group at a given time. The units of measurement should be + determined by the controller that uses this counter. E.g. it can + be bytes, items or any other unit the controller operates on. + + b. unsigned long long max_usage + + The maximal value of the usage over time. + + This value is useful when gathering statistical information about + the particular group, as it shows the actual resource requirements + for a particular group, not just some usage snapshot. + + c. unsigned long long limit + + The maximal allowed amount of resource to consume by the group. In + case the group requests for more resources, so that the usage value + would exceed the limit, the resource allocation is rejected (see + the next section). + + d. unsigned long long failcnt + + The failcnt stands for "failures counter". This is the number of + resource allocation attempts that failed. + + c. spinlock_t lock + + Protects changes of the above values. + + + +2. Basic accounting routines + + a. void res_counter_init(struct res_counter *rc) + + Initializes the resource counter. As usual, should be the first + routine called for a new counter. + + b. int res_counter_charge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is about to be allocated it has to be accounted + with the appropriate resource counter (controller should determine + which one to use on its own). This operation is called "charging". + + This is not very important which operation - resource allocation + or charging - is performed first, but + * if the allocation is performed first, this may create a + temporary resource over-usage by the time resource counter is + charged; + * if the charging is performed first, then it should be uncharged + on error path (if the one is called). + + c. void res_counter_uncharge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is released (freed) it should be de-accounted + from the resource counter it was accounted to. This is called + "uncharging". + + The _locked routines imply that the res_counter->lock is taken. + + + 2.1 Other accounting routines + + There are more routines that may help you with common needs, like + checking whether the limit is reached or resetting the max_usage + value. They are all declared in include/linux/res_counter.h. + + + +3. Analyzing the resource counter registrations + + a. If the failcnt value constantly grows, this means that the counter's + limit is too tight. Either the group is misbehaving and consumes too + many resources, or the configuration is not suitable for the group + and the limit should be increased. + + b. The max_usage value can be used to quickly tune the group. One may + set the limits to maximal values and either load the container with + a common pattern or leave one for a while. After this the max_usage + value shows the amount of memory the container would require during + its common activity. + + Setting the limit a bit above this value gives a pretty good + configuration that works in most of the cases. + + c. If the max_usage is much less than the limit, but the failcnt value + is growing, then the group tries to allocate a big chunk of resource + at once. + + d. If the max_usage is much less than the limit, but the failcnt value + is 0, then this group is given too high limit, that it does not + require. It is better to lower the limit a bit leaving more resource + for other groups. + + + +4. Communication with the control groups subsystem (cgroups) + +All the resource controllers that are using cgroups and resource counters +should provide files (in the cgroup filesystem) to work with the resource +counter fields. They are recommended to adhere to the following rules: + + a. File names + + Field name File name + --------------------------------------------------- + usage usage_in_ + max_usage max_usage_in_ + limit limit_in_ + failcnt failcnt + lock no file :) + + b. Reading from file should show the corresponding field value in the + appropriate format. + + c. Writing to file + + Field Expected behavior + ---------------------------------- + usage prohibited + max_usage reset to usage + limit set the limit + failcnt reset to zero + + + +5. Usage example + + a. Declare a task group (take a look at cgroups subsystem for this) and + fold a res_counter into it + + struct my_group { + struct res_counter res; + + + } + + b. Put hooks in resource allocation/release paths + + int alloc_something(...) + { + if (res_counter_charge(res_counter_ptr, amount) < 0) + return -ENOMEM; + + + } + + void release_something(...) + { + res_counter_uncharge(res_counter_ptr, amount); + + + } + + In order to keep the usage value self-consistent, both the + "res_counter_ptr" and the "amount" in release_something() should be + the same as they were in the alloc_something() when the releasing + resource was allocated. + + c. Provide the way to read res_counter values and set them (the cgroups + still can help with it). + + c. Compile and run :) diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt deleted file mode 100644 index bb775fbe43d7..000000000000 --- a/Documentation/controllers/cpuacct.txt +++ /dev/null @@ -1,32 +0,0 @@ -CPU Accounting Controller -------------------------- - -The CPU accounting controller is used to group tasks using cgroups and -account the CPU usage of these groups of tasks. - -The CPU accounting controller supports multi-hierarchy groups. An accounting -group accumulates the CPU usage of all of its child groups and the tasks -directly present in its group. - -Accounting groups can be created by first mounting the cgroup filesystem. - -# mkdir /cgroups -# mount -t cgroup -ocpuacct none /cgroups - -With the above step, the initial or the parent accounting group -becomes visible at /cgroups. At bootup, this group includes all the -tasks in the system. /cgroups/tasks lists the tasks in this cgroup. -/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by -this group which is essentially the CPU time obtained by all the tasks -in the system. - -New accounting groups can be created under the parent group /cgroups. - -# cd /cgroups -# mkdir g1 -# echo $$ > g1 - -The above steps create a new group g1 and move the current shell -process (bash) into it. CPU time consumed by this bash and its children -can be obtained from g1/cpuacct.usage and the same is accumulated in -/cgroups/cpuacct.usage also. diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt deleted file mode 100644 index 7cc6e6a60672..000000000000 --- a/Documentation/controllers/devices.txt +++ /dev/null @@ -1,52 +0,0 @@ -Device Whitelist Controller - -1. Description: - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. However -when a device access is removed from a parent it will not also be -removed from the child(ren). - -2. User Interface - -An entry is added using devices.allow, and removed using -devices.deny. For instance - - echo 'c 1:3 mr' > /cgroups/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing - - echo a > /cgroups/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing - - echo a > /cgroups/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendent of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. diff --git a/Documentation/controllers/memcg_test.txt b/Documentation/controllers/memcg_test.txt deleted file mode 100644 index 08d4d3ea0d79..000000000000 --- a/Documentation/controllers/memcg_test.txt +++ /dev/null @@ -1,342 +0,0 @@ -Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2008/12/15 -Base Kernel Version: based on 2.6.28-rc8-mm. - -Because VM is getting complex (one of reasons is memcg...), memcg's behavior -is complex. This is a document for memcg's internal behavior. -Please note that implementation details can be changed. - -(*) Topics on API should be in Documentation/controllers/memory.txt) - -0. How to record usage ? - 2 objects are used. - - page_cgroup ....an object per page. - Allocated at boot or memory hotplug. Freed at memory hot removal. - - swap_cgroup ... an entry per swp_entry. - Allocated at swapon(). Freed at swapoff(). - - The page_cgroup has USED bit and double count against a page_cgroup never - occurs. swap_cgroup is used only when a charged page is swapped-out. - -1. Charge - - a page/swp_entry may be charged (usage += PAGE_SIZE) at - - mem_cgroup_newpage_charge() - Called at new page fault and Copy-On-Write. - - mem_cgroup_try_charge_swapin() - Called at do_swap_page() (page fault on swap entry) and swapoff. - Followed by charge-commit-cancel protocol. (With swap accounting) - At commit, a charge recorded in swap_cgroup is removed. - - mem_cgroup_cache_charge() - Called at add_to_page_cache() - - mem_cgroup_cache_charge_swapin() - Called at shmem's swapin. - - mem_cgroup_prepare_migration() - Called before migration. "extra" charge is done and followed by - charge-commit-cancel protocol. - At commit, charge against oldpage or newpage will be committed. - -2. Uncharge - a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - - mem_cgroup_uncharge_page() - Called when an anonymous page is fully unmapped. I.e., mapcount goes - to 0. If the page is SwapCache, uncharge is delayed until - mem_cgroup_uncharge_swapcache(). - - mem_cgroup_uncharge_cache_page() - Called when a page-cache is deleted from radix-tree. If the page is - SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache(). - - mem_cgroup_uncharge_swapcache() - Called when SwapCache is removed from radix-tree. The charge itself - is moved to swap_cgroup. (If mem+swap controller is disabled, no - charge to swap occurs.) - - mem_cgroup_uncharge_swap() - Called when swp_entry's refcnt goes down to 0. A charge against swap - disappears. - - mem_cgroup_end_migration(old, new) - At success of migration old is uncharged (if necessary), a charge - to new page is committed. At failure, charge to old page is committed. - -3. charge-commit-cancel - In some case, we can't know this "charge" is valid or not at charging - (because of races). - To handle such case, there are charge-commit-cancel functions. - mem_cgroup_try_charge_XXX - mem_cgroup_commit_charge_XXX - mem_cgroup_cancel_charge_XXX - these are used in swap-in and migration. - - At try_charge(), there are no flags to say "this page is charged". - at this point, usage += PAGE_SIZE. - - At commit(), the function checks the page should be charged or not - and set flags or avoid charging.(usage -= PAGE_SIZE) - - At cancel(), simply usage -= PAGE_SIZE. - -Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. - -4. Anonymous - Anonymous page is newly allocated at - - page fault into MAP_ANONYMOUS mapping. - - Copy-On-Write. - It is charged right after it's allocated before doing any page table - related operations. Of course, it's uncharged when another page is used - for the fault address. - - At freeing anonymous page (by exit() or munmap()), zap_pte() is called - and pages for ptes are freed one by one.(see mm/memory.c). Uncharges - are done at page_remove_rmap() when page_mapcount() goes down to 0. - - Another page freeing is by page-reclaim (vmscan.c) and anonymous - pages are swapped out. In this case, the page is marked as - PageSwapCache(). uncharge() routine doesn't uncharge the page marked - as SwapCache(). It's delayed until __delete_from_swap_cache(). - - 4.1 Swap-in. - At swap-in, the page is taken from swap-cache. There are 2 cases. - - (a) If the SwapCache is newly allocated and read, it has no charges. - (b) If the SwapCache has been mapped by processes, it has been - charged already. - - This swap-in is one of the most complicated work. In do_swap_page(), - following events occur when pte is unchanged. - - (1) the page (SwapCache) is looked up. - (2) lock_page() - (3) try_charge_swapin() - (4) reuse_swap_page() (may call delete_swap_cache()) - (5) commit_charge_swapin() - (6) swap_free(). - - Considering following situation for example. - - (A) The page has not been charged before (2) and reuse_swap_page() - doesn't call delete_from_swap_cache(). - (B) The page has not been charged before (2) and reuse_swap_page() - calls delete_from_swap_cache(). - (C) The page has been charged before (2) and reuse_swap_page() doesn't - call delete_from_swap_cache(). - (D) The page has been charged before (2) and reuse_swap_page() calls - delete_from_swap_cache(). - - memory.usage/memsw.usage changes to this page/swp_entry will be - Case (A) (B) (C) (D) - Event - Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 - =========================================== - (3) +1/+1 +1/+1 +1/+1 +1/+1 - (4) - 0/ 0 - -1/ 0 - (5) 0/-1 0/ 0 -1/-1 0/ 0 - (6) - 0/-1 - 0/-1 - =========================================== - Result 1/ 1 1/ 1 1/ 1 1/ 1 - - In any cases, charges to this page should be 1/ 1. - - 4.2 Swap-out. - At swap-out, typical state transition is below. - - (a) add to swap cache. (marked as SwapCache) - swp_entry's refcnt += 1. - (b) fully unmapped. - swp_entry's refcnt += # of ptes. - (c) write back to swap. - (d) delete from swap cache. (remove from SwapCache) - swp_entry's refcnt -= 1. - - - At (b), the page is marked as SwapCache and not uncharged. - At (d), the page is removed from SwapCache and a charge in page_cgroup - is moved to swap_cgroup. - - Finally, at task exit, - (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - Here, a charge in swap_cgroup disappears. - -5. Page Cache - Page Cache is charged at - - add_to_page_cache_locked(). - - uncharged at - - __remove_from_page_cache(). - - The logic is very clear. (About migration, see below) - Note: __remove_from_page_cache() is called by remove_from_page_cache() - and __remove_mapping(). - -6. Shmem(tmpfs) Page Cache - Memcg's charge/uncharge have special handlers of shmem. The best way - to understand shmem's page state transition is to read mm/shmem.c. - But brief explanation of the behavior of memcg around shmem will be - helpful to understand the logic. - - Shmem's page (just leaf page, not direct/indirect block) can be on - - radix-tree of shmem's inode. - - SwapCache. - - Both on radix-tree and SwapCache. This happens at swap-in - and swap-out, - - It's charged when... - - A new page is added to shmem's radix-tree. - - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - It's uncharged when - - A page is removed from radix-tree and not SwapCache. - - When SwapCache is removed, a charge is moved to swap_cgroup. - - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup - disappears. - -7. Page Migration - One of the most complicated functions is page-migration-handler. - Memcg has 2 routines. Assume that we are migrating a page's contents - from OLDPAGE to NEWPAGE. - - Usual migration logic is.. - (a) remove the page from LRU. - (b) allocate NEWPAGE (migration target) - (c) lock by lock_page(). - (d) unmap all mappings. - (e-1) If necessary, replace entry in radix-tree. - (e-2) move contents of a page. - (f) map all mappings again. - (g) pushback the page to LRU. - (-) OLDPAGE will be freed. - - Before (g), memcg should complete all necessary charge/uncharge to - NEWPAGE/OLDPAGE. - - The point is.... - - If OLDPAGE is anonymous, all charges will be dropped at (d) because - try_to_unmap() drops all mapcount and the page will not be - SwapCache. - - - If OLDPAGE is SwapCache, charges will be kept at (g) because - __delete_from_swap_cache() isn't called at (e-1) - - - If OLDPAGE is page-cache, charges will be kept at (g) because - remove_from_swap_cache() isn't called at (e-1) - - memcg provides following hooks. - - - mem_cgroup_prepare_migration(OLDPAGE) - Called after (b) to account a charge (usage += PAGE_SIZE) against - memcg which OLDPAGE belongs to. - - - mem_cgroup_end_migration(OLDPAGE, NEWPAGE) - Called after (f) before (g). - If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already - charged, a charge by prepare_migration() is automatically canceled. - If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE. - - But zap_pte() (by exit or munmap) can be called while migration, - we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). - -8. LRU - Each memcg has its own private LRU. Now, it's handling is under global - VM's control (means that it's handled under global zone->lru_lock). - Almost all routines around memcg's LRU is called by global LRU's - list management functions under zone->lru_lock(). - - A special function is mem_cgroup_isolate_pages(). This scans - memcg's private LRU and call __isolate_lru_page() to extract a page - from LRU. - (By __isolate_lru_page(), the page is removed from both of global and - private LRU.) - - -9. Typical Tests. - - Tests for racy cases. - - 9.1 Small limit to memcg. - When you do test to do racy case, it's good test to set memcg's limit - to be very small rather than GB. Many races found in the test under - xKB or xxMB limits. - (Memory behavior under GB and Memory behavior under MB shows very - different situation.) - - 9.2 Shmem - Historically, memcg's shmem handling was poor and we saw some amount - of troubles here. This is because shmem is page-cache but can be - SwapCache. Test with shmem/tmpfs is always good test. - - 9.3 Migration - For NUMA, migration is an another special case. To do easy test, cpuset - is useful. Following is a sample script to do migration. - - mount -t cgroup -o cpuset none /opt/cpuset - - mkdir /opt/cpuset/01 - echo 1 > /opt/cpuset/01/cpuset.cpus - echo 0 > /opt/cpuset/01/cpuset.mems - echo 1 > /opt/cpuset/01/cpuset.memory_migrate - mkdir /opt/cpuset/02 - echo 1 > /opt/cpuset/02/cpuset.cpus - echo 1 > /opt/cpuset/02/cpuset.mems - echo 1 > /opt/cpuset/02/cpuset.memory_migrate - - In above set, when you moves a task from 01 to 02, page migration to - node 0 to node 1 will occur. Following is a script to migrate all - under cpuset. - -- - move_task() - { - for pid in $1 - do - /bin/echo $pid >$2/tasks 2>/dev/null - echo -n $pid - echo -n " " - done - echo END - } - - G1_TASK=`cat ${G1}/tasks` - G2_TASK=`cat ${G2}/tasks` - move_task "${G1_TASK}" ${G2} & - -- - 9.4 Memory hotplug. - memory hotplug test is one of good test. - to offline memory, do following. - # echo offline > /sys/devices/system/memory/memoryXXX/state - (XXX is the place of memory) - This is an easy way to test page migration, too. - - 9.5 mkdir/rmdir - When using hierarchy, mkdir/rmdir test should be done. - Use tests like the following. - - echo 1 >/opt/cgroup/01/memory/use_hierarchy - mkdir /opt/cgroup/01/child_a - mkdir /opt/cgroup/01/child_b - - set limit to 01. - add limit to 01/child_b - run jobs under child_a and child_b - - create/delete following groups at random while jobs are running. - /opt/cgroup/01/child_a/child_aa - /opt/cgroup/01/child_b/child_bb - /opt/cgroup/01/child_c - - running new jobs in new group is also good. - - 9.6 Mount with other subsystems. - Mounting with other subsystems is a good test because there is a - race and lock dependency with other cgroup subsystems. - - example) - # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices - - and do task move, mkdir, rmdir etc...under this. diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt deleted file mode 100644 index e1501964df1e..000000000000 --- a/Documentation/controllers/memory.txt +++ /dev/null @@ -1,399 +0,0 @@ -Memory Resource Controller - -NOTE: The Memory Resource Controller has been generically been referred -to as the memory controller in this document. Do not confuse memory controller -used here with the memory controller that is used in hardware. - -Salient features - -a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages -b. The infrastructure allows easy addition of other types of memory to control -c. Provides *zero overhead* for non memory controller users -d. Provides a double LRU: global memory pressure causes reclaim from the - global LRU; a cgroup on hitting a limit, reclaims from the per - cgroup LRU - -NOTE: Swap Cache (unmapped) is not accounted now. - -Benefits and Purpose of the memory controller - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with limited amount of memory, this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases, find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -1. History - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design - -The core of the design is a counter called the res_counter. The res_counter -tracks the current memory usage and limit of the group of processes associated -with the controller. Each cgroup has a memory controller specific data -structure (mem_cgroup) associated with it. - -2.2. Accounting - - +--------------------+ - | mem_cgroup | - | (res_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge() is invoked to setup -the necessary data structures and check if the cgroup that is being charged -is over its limit. If it is then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -allocated and associated with the page. This routine also adds the page to -the per cgroup LRU. - -2.2.1 Accounting details - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -(some pages which never be reclaimable and will not be on global LRU - are not accounted. we just accounts pages under usual vm management.) - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -A RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. - -At page migration, accounting information is kept. - -Note: we just account pages-on-lru because our purpose is to control amount -of used pages. not-on-lru pages are tend to be out-of-control from vm view. - -2.3 Shared Page Accounting - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.. -When you do swapoff and make swapped-out pages of shmem(tmpfs) to -be backed into memory in force, charges for pages are accounted against the -caller of swapoff rather than the users of shmem. - - -2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) -Swap Extension allows you to record charge for swap. A swapped-in page is -charged back to original page allocator if possible. - -When swap is accounted, following files are added. - - memory.memsw.usage_in_bytes. - - memory.memsw.limit_in_bytes. - -usage of mem+swap is limited by memsw.limit_in_bytes. - -Note: why 'mem+swap' rather than swap. -The global LRU(kswapd) can swap out arbitrary pages. Swap-out means -to move account from memory to swap...there is no change in usage of -mem+swap. - -In other words, when we want to limit the usage of swap without affecting -global LRU, mem+swap limit is better than just limiting swap from OS point -of view. - -2.5 Reclaim - -Each cgroup maintains a per cgroup LRU that consists of an active -and inactive list. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per cgroup LRU -list. - -2. Locking - -The memory controller uses the following hierarchy - -1. zone->lru_lock is used for selecting pages to be isolated -2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) -3. lock_page_cgroup() is used to protect page->page_cgroup - -3. User Interface - -0. Configuration - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_RESOURCE_COUNTERS -c. Enable CONFIG_CGROUP_MEM_RES_CTLR - -1. Prepare the cgroups -# mkdir -p /cgroups -# mount -t cgroup none /cgroups -o memory - -2. Make the new group and move bash into it -# mkdir /cgroups/0 -# echo $$ > /cgroups/0/tasks - -Since now we're in the 0 cgroup, -We can alter the memory limit: -# echo 4M > /cgroups/0/memory.limit_in_bytes - -NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, -mega or gigabytes. - -# cat /cgroups/0/memory.limit_in_bytes -4194304 - -NOTE: The interface has now changed to display the usage in bytes -instead of pages - -We can check the usage: -# cat /cgroups/0/memory.usage_in_bytes -1216512 - -A successful write to this file does not guarantee a successful set of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel. - -# echo 1 > memory.limit_in_bytes -# cat memory.limit_in_bytes -4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -4. Testing - -Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. -Apart from that v6 has been tested with several applications and regular -daily use. The controller has also been tested on the PPC64, x86_64 and -UML platforms. - -4.1 Troubleshooting - -Sometimes a user might find that the application under a cgroup is -terminated. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -4.2 Task migration - -When a task migrates from one cgroup to another, it's charge is not -carried forward. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -4.3 Removing a cgroup - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. -Such charges are freed(at default) or moved to its parent. When moved, -both of RSS and CACHES are moved to parent. -If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. - -Charges recorded in swap information is not updated at removal of cgroup. -Recorded information is discarded and a cgroup which uses swap (swapcache) -will be charged as a new owner of it. - - -5. Misc. interfaces. - -5.1 force_empty - memory.force_empty interface is provided to make cgroup's memory usage empty. - You can use this interface only when the cgroup has no tasks. - When writing anything to this - - # echo 0 > memory.force_empty - - Almost all pages tracked by this memcg will be unmapped and freed. Some of - pages cannot be freed because it's locked or in-use. Such pages are moved - to parent and this cgroup will be empty. But this may return -EBUSY in - some too busy case. - - Typical use case of this interface is that calling this before rmdir(). - Because rmdir() moves all pages to parent, some out-of-use page caches can be - moved to the parent. If you want to avoid that, force_empty will be useful. - -5.2 stat file - memory.stat file includes following statistics (now) - cache - # of pages from page-cache and shmem. - rss - # of pages from anonymous memory. - pgpgin - # of event of charging - pgpgout - # of event of uncharging - active_anon - # of pages on active lru of anon, shmem. - inactive_anon - # of pages on active lru of anon, shmem - active_file - # of pages on active lru of file-cache - inactive_file - # of pages on inactive lru of file cache - unevictable - # of pages cannot be reclaimed.(mlocked etc) - - Below is depend on CONFIG_DEBUG_VM. - inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) - recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) - recent_rotated_file - VM internal parameter. (see mm/vmscan.c) - recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) - recent_scanned_file - VM internal parameter. (see mm/vmscan.c) - - Memo: - recent_rotated means recent frequency of lru rotation. - recent_scanned means recent # of scans to lru. - showing for better debug please see the code for meanings. - - -5.3 swappiness - Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. - - Following cgroup's swapiness can't be changed. - - root cgroup (uses /proc/sys/vm/swappiness). - - a cgroup which uses hierarchy and it has child cgroup. - - a cgroup which uses hierarchy and not the root of hierarchy. - - -6. Hierarchy support - -The memory controller supports a deep hierarchy and hierarchical accounting. -The hierarchy is created by creating the appropriate cgroups in the -cgroup filesystem. Consider for example, the following cgroup filesystem -hierarchy - - root - / | \ - / | \ - a b c - | \ - | \ - d e - -In the diagram above, with hierarchical accounting enabled, all memory -usage of e, is accounted to its ancestors up until the root (i.e, c and root), -that has memory.use_hierarchy enabled. If one of the ancestors goes over its -limit, the reclaim algorithm reclaims from the tasks in the ancestor and the -children of the ancestor. - -6.1 Enabling hierarchical accounting and reclaim - -The memory controller by default disables the hierarchy feature. Support -can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup - -# echo 1 > memory.use_hierarchy - -The feature can be disabled by - -# echo 0 > memory.use_hierarchy - -NOTE1: Enabling/disabling will fail if the cgroup already has other -cgroups created below it. - -NOTE2: This feature can be enabled/disabled per subtree. - -7. TODO - -1. Add support for accounting huge pages (as a separate controller) -2. Make per-cgroup scanner reclaim not-shared pages first -3. Teach controller to account for shared-pages -4. Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/controllers/resource_counter.txt b/Documentation/controllers/resource_counter.txt deleted file mode 100644 index f196ac1d7d25..000000000000 --- a/Documentation/controllers/resource_counter.txt +++ /dev/null @@ -1,181 +0,0 @@ - - The Resource Counter - -The resource counter, declared at include/linux/res_counter.h, -is supposed to facilitate the resource management by controllers -by providing common stuff for accounting. - -This "stuff" includes the res_counter structure and routines -to work with it. - - - -1. Crucial parts of the res_counter structure - - a. unsigned long long usage - - The usage value shows the amount of a resource that is consumed - by a group at a given time. The units of measurement should be - determined by the controller that uses this counter. E.g. it can - be bytes, items or any other unit the controller operates on. - - b. unsigned long long max_usage - - The maximal value of the usage over time. - - This value is useful when gathering statistical information about - the particular group, as it shows the actual resource requirements - for a particular group, not just some usage snapshot. - - c. unsigned long long limit - - The maximal allowed amount of resource to consume by the group. In - case the group requests for more resources, so that the usage value - would exceed the limit, the resource allocation is rejected (see - the next section). - - d. unsigned long long failcnt - - The failcnt stands for "failures counter". This is the number of - resource allocation attempts that failed. - - c. spinlock_t lock - - Protects changes of the above values. - - - -2. Basic accounting routines - - a. void res_counter_init(struct res_counter *rc) - - Initializes the resource counter. As usual, should be the first - routine called for a new counter. - - b. int res_counter_charge[_locked] - (struct res_counter *rc, unsigned long val) - - When a resource is about to be allocated it has to be accounted - with the appropriate resource counter (controller should determine - which one to use on its own). This operation is called "charging". - - This is not very important which operation - resource allocation - or charging - is performed first, but - * if the allocation is performed first, this may create a - temporary resource over-usage by the time resource counter is - charged; - * if the charging is performed first, then it should be uncharged - on error path (if the one is called). - - c. void res_counter_uncharge[_locked] - (struct res_counter *rc, unsigned long val) - - When a resource is released (freed) it should be de-accounted - from the resource counter it was accounted to. This is called - "uncharging". - - The _locked routines imply that the res_counter->lock is taken. - - - 2.1 Other accounting routines - - There are more routines that may help you with common needs, like - checking whether the limit is reached or resetting the max_usage - value. They are all declared in include/linux/res_counter.h. - - - -3. Analyzing the resource counter registrations - - a. If the failcnt value constantly grows, this means that the counter's - limit is too tight. Either the group is misbehaving and consumes too - many resources, or the configuration is not suitable for the group - and the limit should be increased. - - b. The max_usage value can be used to quickly tune the group. One may - set the limits to maximal values and either load the container with - a common pattern or leave one for a while. After this the max_usage - value shows the amount of memory the container would require during - its common activity. - - Setting the limit a bit above this value gives a pretty good - configuration that works in most of the cases. - - c. If the max_usage is much less than the limit, but the failcnt value - is growing, then the group tries to allocate a big chunk of resource - at once. - - d. If the max_usage is much less than the limit, but the failcnt value - is 0, then this group is given too high limit, that it does not - require. It is better to lower the limit a bit leaving more resource - for other groups. - - - -4. Communication with the control groups subsystem (cgroups) - -All the resource controllers that are using cgroups and resource counters -should provide files (in the cgroup filesystem) to work with the resource -counter fields. They are recommended to adhere to the following rules: - - a. File names - - Field name File name - --------------------------------------------------- - usage usage_in_ - max_usage max_usage_in_ - limit limit_in_ - failcnt failcnt - lock no file :) - - b. Reading from file should show the corresponding field value in the - appropriate format. - - c. Writing to file - - Field Expected behavior - ---------------------------------- - usage prohibited - max_usage reset to usage - limit set the limit - failcnt reset to zero - - - -5. Usage example - - a. Declare a task group (take a look at cgroups subsystem for this) and - fold a res_counter into it - - struct my_group { - struct res_counter res; - - - } - - b. Put hooks in resource allocation/release paths - - int alloc_something(...) - { - if (res_counter_charge(res_counter_ptr, amount) < 0) - return -ENOMEM; - - - } - - void release_something(...) - { - res_counter_uncharge(res_counter_ptr, amount); - - - } - - In order to keep the usage value self-consistent, both the - "res_counter_ptr" and the "amount" in release_something() should be - the same as they were in the alloc_something() when the releasing - resource was allocated. - - c. Provide the way to read res_counter values and set them (the cgroups - still can help with it). - - c. Compile and run :) diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt deleted file mode 100644 index 5c86c258c791..000000000000 --- a/Documentation/cpusets.txt +++ /dev/null @@ -1,808 +0,0 @@ - CPUSETS - ------- - -Copyright (C) 2004 BULL SA. -Written by Simon.Derr@bull.net - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter -Modified by Paul Menage -Modified by Hidetoshi Seto - -CONTENTS: -========= - -1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes -3. Questions -4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a tasks current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroups/cgroups.txt. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that tasks cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting tasks mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. - -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. - -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. - - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that tasks cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that tasks cpuset. - - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. - - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendents) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that tasks cpuset. - - in sched.c migrate_all_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that tasks cpuset. - - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. - -The /proc//status file for each task has four added lines, -displaying the tasks cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpus: list of CPUs in that cpuset - - mems: list of Memory Nodes in that cpuset - - memory_migrate flag: if set, move pages to cpusets nodes - - cpu_exclusive flag: is cpu placement exclusive? - - mem_exclusive flag: is memory placement exclusive? - - mem_hardwall flag: is memory allocation hardwalled - - memory_pressure: measure of how much paging pressure in cpuset - -In addition, the root cpuset only has the following file: - - memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command. The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpusets directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parents. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps a -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_map using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ? --------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendent, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", -i.e. it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. It's up to the -batch manager or other user code to decide what to do about it and -take action. - -==> Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'memory_spread_page' and -'memory_spread_slab'. - -If the per-cpuset boolean flag file 'memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the tasks NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the tasks NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing tasks memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'memory_spread_page' turns on a per-process flag -PF_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. The page allocation calls for the page cache -is modified to perform an inline check for this PF_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'memory_spread_slab' turns on the flag -PF_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current tasks mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the jobs cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the jobs cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the systems CPUs into a number of sched -domains such that it only load balances within each sched domain. -Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, except those -marked isolated using the kernel boot time "isolcpus=" argument. - -This default load balancing across all CPUs is not well suited for -the following two situations: - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpusets allowed 'cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwised pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing. - -Therefore in the above two situations, the top cpuset flag -"sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendent cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside it cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. - -If two cpusets have partially overlapping 'cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. This partition is a set of subsets (represented -as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all -the CPUs that must be load balanced. - -Whenever the 'sched_load_balance' flag changes, or CPUs come or go -from a cpuset with this flag enabled, or a cpuset with this flag -enabled is removed, the cpuset code builds a new such partition and -passes it to the scheduler sched domain setup code, to have the sched -domains rebuilt as necessary. - -This partition exactly defines what sched domains the scheduler should -setup - one sched domain for each element (cpumask_t) in the partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In sched domain, the scheduler migrates tasks in 2 ways; periodic load -balance on tick, and at time of some schedule events. - -When a task is woken up, scheduler try to move the task on idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and performing idle, -then scheduler migrate task B to CPU Y so that task B can start on -CPU Y without waiting task A on CPU X. - -And if a CPU run out of tasks in its runqueue, the CPU try to pull -extra tasks from other busy CPUs to help them before it is going to -be idle. - -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, the scheduler might not search all CPUs in the domain -everytime. In fact, in some architectures, the searching ranges on -events are limited in the same socket or node where the CPU locates, -while the load balance on tick searchs all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and the siblings are busy, scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As the result, task B on CPU X need to wait task A or wait load balance -on the next tick. For some applications in special situation, waiting -1 tick may be too long. - -The 'sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, -otherwise initial value -1 that indicates the cpuset has no request. - - -1 : no request. use system default or follow request of others. - 0 : no search. - 1 : search siblings (hyperthreads in a core). - 2 : search cores in a package. - 3 : search cpus in a node [= system wide on non-NUMA system] - ( 4 : search nodes in a chunk of node [on NUMA system] ) - ( 5 : search system wide [on NUMA system] ) - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affect the sched domain where the cpuset -belongs to. Therefore if the flag 'sched_load_balance' of a cpuset -is disabled, then 'sched_relax_domain_level' have no effect since -there is no sched domain belonging the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. Be careful, if one -requests 0 and others are -1 then 0 is used. - -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not will be depend on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. -then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the tasks cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its numa placement, -as queried by get_mempolicy(), doesn't change). If a task is moved -from one cpuset to another, then the kernel will adjust the tasks -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a tasks pid is written to a cpusets 'tasks' file, in either its -current cpuset or another cpuset, then its allowed CPU placement is -changed immediately. If such a task had been bound to some subset -of its cpuset using the sched_setaffinity() call, the task will be -allowed to run on any CPU allowed in its new cpuset, negating the -affect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -but the processor placement is not updated, until that tasks pid is -rewritten to the 'tasks' file of its cpuset. This is done to avoid -impacting the scheduler code in the kernel with a check for changes -in a tasks processor placement. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'mems' subsequently changes. -If the cpuset flag file 'memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the tasks new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'memory_migrate' is set true, then if that cpusets -'mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'mems', -will be moved to nodes in the new setting of 'mems.' -Pages that were not in the tasks prior cpuset, or in the cpusets -prior 'mems' setting, will not be moved. - -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current tasks cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /dev/cpuset - 2) mount -t cgroup -ocpuset cpuset /dev/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /dev/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /dev/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset: - - mount -t cgroup -ocpuset cpuset /dev/cpuset - cd /dev/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpus - /bin/echo 1 > mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -In the future, a C library interface to cpusets will likely be -available. For now, the only way to query or modify cpusets is -via the cpuset file system, using the various cd, mkdir, echo, cat, -rmdir commands from the shell, or their equivalent from C. - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /dev/cpuset - -Then under /dev/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /dev/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /dev/cpuset: -# cd /dev/cpuset -# mkdir my_cpuset - -Now you want to do something with this cpuset. -# cd my_cpuset - -In this directory you can find several files: -# ls -cpu_exclusive memory_migrate mems tasks -cpus memory_pressure notify_on_release -mem_exclusive memory_spread_page sched_load_balance -mem_hardwall memory_spread_slab sched_relax_domain_level - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags: -# /bin/echo 1 > cpu_exclusive - -Add some cpus: -# /bin/echo 0-7 > cpus - -Add some mems: -# /bin/echo 0-7 > mems - -Now attach your shell to this cpuset: -# /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cpuset, just use rmdir: -# rmdir my_sub_cs -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command - -mount -t cpuset X /dev/cpuset - -is equivalent to - -mount -t cgroup -ocpuset X /dev/cpuset -echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories: - -# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 -# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 - -2.3 Setting flags ------------------ - -The syntax is very simple: - -# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' -# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' - -2.4 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 8398ca4ff4ed..6f33593e59e2 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -231,7 +231,7 @@ CPU bandwidth control purposes: This options needs CONFIG_CGROUPS to be defined, and lets the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroups.txt for more information about this filesystem. + Documentation/cgroups/cgroups.txt for more information about this filesystem. Only one of these options to group tasks can be chosen and not both. diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index dede0a2cfc45..4c5bcf6ca7e8 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -9,7 +9,7 @@ * * Author: Pavel Emelianov * - * See Documentation/controllers/resource_counter.txt for more + * See Documentation/cgroups/resource_counter.txt for more * info about what this counter is. */ diff --git a/init/Kconfig b/init/Kconfig index 56fd93c63c77..2af83825634e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -323,8 +323,8 @@ config CGROUP_SCHED This option allows you to create arbitrary task groups using the "cgroup" pseudo filesystem and control the cpu bandwidth allocated to each such task group. - Refer to Documentation/cgroups.txt for more information - on "cgroup" pseudo filesystem. + Refer to Documentation/cgroups/cgroups.txt for more + information on "cgroup" pseudo filesystem. endchoice @@ -335,10 +335,9 @@ menuconfig CGROUPS use with process control subsystems such as Cpusets, CFS, memory controls or device isolation. See - - Documentation/cpusets.txt (Cpusets) - Documentation/scheduler/sched-design-CFS.txt (CFS) - - Documentation/cgroups/ (features for grouping, isolation) - - Documentation/controllers/ (features for resource control) + - Documentation/cgroups/ (features for grouping, isolation + and resource control) Say N if unsure. diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 647c77a88fcb..a85678865c5e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -568,7 +568,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cpusets.txt + * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt * for a background explanation of this. * * Does not return errors, on the theory that the callers of this -- cgit v1.2.3 From 6ae301e85c9c58d2f430a8a7057ce488b7ff76df Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Jan 2009 13:51:01 -0800 Subject: resources: fix parameter name and kernel-doc Fix __request_region() parameter kernel-doc notation and parameter name: Warning(linux-2.6.28-git10//kernel/resource.c:627): No description found for parameter 'flags' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 3 ++- kernel/resource.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index f6bb2ca8e3ba..32e4b2f72294 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -143,7 +143,8 @@ static inline unsigned long resource_type(struct resource *res) extern struct resource * __request_region(struct resource *, resource_size_t start, - resource_size_t n, const char *name, int relaxed); + resource_size_t n, + const char *name, int flags); /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) diff --git a/kernel/resource.c b/kernel/resource.c index ca6a1536b205..fd5d7d574bb9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -620,6 +620,7 @@ resource_size_t resource_alignment(struct resource *res) * @start: resource start address * @n: resource region size * @name: reserving caller's ID string + * @flags: IO resource flags */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, -- cgit v1.2.3 From 33f1d7ecc6cffff3c618a02295de969ebbacd95d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 Jan 2009 21:14:04 +0100 Subject: PM: Fix freezer compilation if PM_SLEEP is unset Freezer fails to compile if with the following configuration settings: CONFIG_CGROUPS=y CONFIG_CGROUP_FREEZER=y CONFIG_MODULES=y CONFIG_FREEZER=y CONFIG_PM=y CONFIG_PM_SLEEP=n Fix this by making process.o compilation depend on CONFIG_FREEZER. Reported-by: Cheng Renquan Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Len Brown --- kernel/power/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 597823b5b700..d7a10167a25b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,8 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o -obj-$(CONFIG_PM_SLEEP) += process.o console.o +obj-$(CONFIG_PM_SLEEP) += console.o +obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o -- cgit v1.2.3 From 5a4ccaf37ffece09ef33f1cfec67efa8ee56f967 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 21:15:32 +0100 Subject: kprobes: check CONFIG_FREEZER instead of CONFIG_PM Check CONFIG_FREEZER instead of CONFIG_PM because kprobe booster depends on freeze_processes() and thaw_processes() when CONFIG_PREEMPT=y. This fixes a linkage error which occurs when CONFIG_PREEMPT=y, CONFIG_PM=y and CONFIG_FREEZER=n. Reported-by: Cheng Renquan Signed-off-by: Masami Hiramatsu Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar Signed-off-by: Len Brown --- arch/ia64/kernel/kprobes.c | 2 +- arch/x86/kernel/kprobes.c | 2 +- kernel/kprobes.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index f90be51b1123..9adac441ac9b 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -870,7 +870,7 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) return 1; ss_probe: -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ ia64_psr(regs)->ri = p->ainsn.slot; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 884d985b8b82..e948b28a5a9a 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -446,7 +446,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1b9cbdc0127a..7ba8cd9845cb 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -123,7 +123,7 @@ static int collect_garbage_slots(void); static int __kprobes check_safety(void) { int ret = 0; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) +#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER) ret = freeze_processes(); if (ret == 0) { struct task_struct *p, *q; -- cgit v1.2.3