From 26047e2d6bde5b2e1b791e0ec1c3234894fdf3fa Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Fri, 30 Nov 2012 11:28:55 +0100 Subject: ASoC: cs4271: fix sparse warning Make the flag in the pdata of type bool to fix a sparse warning. Signed-off-by: Daniel Mack Reported-by: Fengguang Wu Signed-off-by: Mark Brown --- include/sound/cs4271.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/cs4271.h b/include/sound/cs4271.h index 6d9e15ed1dcf..dd8c48d14ed9 100644 --- a/include/sound/cs4271.h +++ b/include/sound/cs4271.h @@ -19,7 +19,7 @@ struct cs4271_platform_data { int gpio_nreset; /* GPIO driving Reset pin, if any */ - int amutec_eq_bmutec:1; /* flag to enable AMUTEC=BMUTEC */ + bool amutec_eq_bmutec; /* flag to enable AMUTEC=BMUTEC */ }; #endif /* __CS4271_H */ -- cgit v1.2.3 From 9bde4f0b1c83d1129a9fc8ec5b2611ba6dab1215 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 19 Dec 2012 16:05:00 +0000 Subject: ASoC: core: Fix SOC_DOUBLE_RANGE() macros Although we've had macros defining double _RANGE controls for a while now they've not actually been backed up properly by the implementation, it's treated everything as mono. Fix that by implementing the handling in the stereo controls, ensuring that the mono controls don't mistakenly get treated as stereo. Signed-off-by: Mark Brown Acked-by: Liam Girdwood --- include/sound/soc.h | 10 ++++++---- sound/soc/soc-core.c | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 769e27c774a3..bc56738cb109 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -58,8 +58,9 @@ .info = snd_soc_info_volsw_range, .get = snd_soc_get_volsw_range, \ .put = snd_soc_put_volsw_range, \ .private_value = (unsigned long)&(struct soc_mixer_control) \ - {.reg = xreg, .shift = xshift, .min = xmin,\ - .max = xmax, .platform_max = xmax, .invert = xinvert} } + {.reg = xreg, .rreg = xreg, .shift = xshift, \ + .rshift = xshift, .min = xmin, .max = xmax, \ + .platform_max = xmax, .invert = xinvert} } #define SOC_SINGLE_TLV(xname, reg, shift, max, invert, tlv_array) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \ .access = SNDRV_CTL_ELEM_ACCESS_TLV_READ |\ @@ -88,8 +89,9 @@ .info = snd_soc_info_volsw_range, \ .get = snd_soc_get_volsw_range, .put = snd_soc_put_volsw_range, \ .private_value = (unsigned long)&(struct soc_mixer_control) \ - {.reg = xreg, .shift = xshift, .min = xmin,\ - .max = xmax, .platform_max = xmax, .invert = xinvert} } + {.reg = xreg, .rreg = xreg, .shift = xshift, \ + .rshift = xshift, .min = xmin, .max = xmax, \ + .platform_max = xmax, .invert = xinvert} } #define SOC_DOUBLE(xname, reg, shift_left, shift_right, max, invert) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname),\ .info = snd_soc_info_volsw, .get = snd_soc_get_volsw, \ diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 91d592ff67b7..e0d4630bfb4f 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2917,7 +2917,7 @@ int snd_soc_info_volsw_range(struct snd_kcontrol *kcontrol, platform_max = mc->platform_max; uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; - uinfo->count = 1; + uinfo->count = snd_soc_volsw_is_stereo(mc) ? 
2 : 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = platform_max - min; @@ -2941,12 +2941,14 @@ int snd_soc_put_volsw_range(struct snd_kcontrol *kcontrol, (struct soc_mixer_control *)kcontrol->private_value; struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); unsigned int reg = mc->reg; + unsigned int rreg = mc->rreg; unsigned int shift = mc->shift; int min = mc->min; int max = mc->max; unsigned int mask = (1 << fls(max)) - 1; unsigned int invert = mc->invert; unsigned int val, val_mask; + int ret; val = ((ucontrol->value.integer.value[0] + min) & mask); if (invert) @@ -2954,7 +2956,21 @@ int snd_soc_put_volsw_range(struct snd_kcontrol *kcontrol, val_mask = mask << shift; val = val << shift; - return snd_soc_update_bits_locked(codec, reg, val_mask, val); + ret = snd_soc_update_bits_locked(codec, reg, val_mask, val); + if (ret != 0) + return ret; + + if (snd_soc_volsw_is_stereo(mc)) { + val = ((ucontrol->value.integer.value[1] + min) & mask); + if (invert) + val = max - val; + val_mask = mask << shift; + val = val << shift; + + ret = snd_soc_update_bits_locked(codec, rreg, val_mask, val); + } + + return ret; } EXPORT_SYMBOL_GPL(snd_soc_put_volsw_range); @@ -2974,11 +2990,13 @@ int snd_soc_get_volsw_range(struct snd_kcontrol *kcontrol, (struct soc_mixer_control *)kcontrol->private_value; struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); unsigned int reg = mc->reg; + unsigned int rreg = mc->rreg; unsigned int shift = mc->shift; int min = mc->min; int max = mc->max; unsigned int mask = (1 << fls(max)) - 1; unsigned int invert = mc->invert; + int ret; ucontrol->value.integer.value[0] = (snd_soc_read(codec, reg) >> shift) & mask; @@ -2988,6 +3006,16 @@ int snd_soc_get_volsw_range(struct snd_kcontrol *kcontrol, ucontrol->value.integer.value[0] = ucontrol->value.integer.value[0] - min; + if (snd_soc_volsw_is_stereo(mc)) { + ucontrol->value.integer.value[1] = + (snd_soc_read(codec, rreg) >> shift) & mask; + if (invert) + ucontrol->value.integer.value[1] = + max - ucontrol->value.integer.value[1]; + ucontrol->value.integer.value[1] = + ucontrol->value.integer.value[1] - min; + } + return 0; } EXPORT_SYMBOL_GPL(snd_soc_get_volsw_range); -- cgit v1.2.3 From 901593f2bf221659a605bdc1dcb11376ea934163 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 19 Dec 2012 16:51:06 +0000 Subject: drm: Only evict the blocks required to create the requested hole Avoid clobbering adjacent blocks if they happen to expire earlier and amalgamate together to form the requested hole. In passing this fixes a regression from commit ea7b1dd44867e9cd6bac67e7c9fc3f128b5b255c Author: Daniel Vetter Date: Fri Feb 18 17:59:12 2011 +0100 drm: mm: track free areas implicitly which swaps the end address for size (with a potential overflow) and effectively causes the eviction code to clobber almost all earlier buffers above the evictee. v2: Check the original hole not the adjusted as the coloring may confuse us when later searching for the overlapping nodes. Also make sure that we do apply the range restriction and color adjustment in the same order for both scanning, searching and insertion. v3: Send the version that was actually tested. Note that this seems to be ducttape of decent quality ot paper over some of our unbind related gpu hangs reported since 3.7. It is not fully effective though, and certainly doesn't fix the underlying bug. Signed-off-by: Chris Wilson Cc: Daniel Vetter [danvet: Added note plus bugzilla link and tested-by.] 
Cc: stable@vger.kernel.org Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=55984 Tested-by: Norbert Preining Acked-by: Dave Airlie --- drivers/gpu/drm/drm_mm.c | 45 +++++++++++++++++---------------------------- include/drm/drm_mm.h | 2 +- 2 files changed, 18 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 2bf9670ba29b..2aa331499f81 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -221,11 +221,13 @@ static void drm_mm_insert_helper_range(struct drm_mm_node *hole_node, BUG_ON(!hole_node->hole_follows || node->allocated); - if (mm->color_adjust) - mm->color_adjust(hole_node, color, &adj_start, &adj_end); - if (adj_start < start) adj_start = start; + if (adj_end > end) + adj_end = end; + + if (mm->color_adjust) + mm->color_adjust(hole_node, color, &adj_start, &adj_end); if (alignment) { unsigned tmp = adj_start % alignment; @@ -506,7 +508,7 @@ void drm_mm_init_scan(struct drm_mm *mm, mm->scan_size = size; mm->scanned_blocks = 0; mm->scan_hit_start = 0; - mm->scan_hit_size = 0; + mm->scan_hit_end = 0; mm->scan_check_range = 0; mm->prev_scanned_node = NULL; } @@ -533,7 +535,7 @@ void drm_mm_init_scan_with_range(struct drm_mm *mm, mm->scan_size = size; mm->scanned_blocks = 0; mm->scan_hit_start = 0; - mm->scan_hit_size = 0; + mm->scan_hit_end = 0; mm->scan_start = start; mm->scan_end = end; mm->scan_check_range = 1; @@ -552,8 +554,7 @@ int drm_mm_scan_add_block(struct drm_mm_node *node) struct drm_mm *mm = node->mm; struct drm_mm_node *prev_node; unsigned long hole_start, hole_end; - unsigned long adj_start; - unsigned long adj_end; + unsigned long adj_start, adj_end; mm->scanned_blocks++; @@ -570,14 +571,8 @@ int drm_mm_scan_add_block(struct drm_mm_node *node) node->node_list.next = &mm->prev_scanned_node->node_list; mm->prev_scanned_node = node; - hole_start = drm_mm_hole_node_start(prev_node); - hole_end = drm_mm_hole_node_end(prev_node); - - adj_start = hole_start; - adj_end = hole_end; - - if (mm->color_adjust) - mm->color_adjust(prev_node, mm->scan_color, &adj_start, &adj_end); + adj_start = hole_start = drm_mm_hole_node_start(prev_node); + adj_end = hole_end = drm_mm_hole_node_end(prev_node); if (mm->scan_check_range) { if (adj_start < mm->scan_start) @@ -586,11 +581,14 @@ int drm_mm_scan_add_block(struct drm_mm_node *node) adj_end = mm->scan_end; } + if (mm->color_adjust) + mm->color_adjust(prev_node, mm->scan_color, + &adj_start, &adj_end); + if (check_free_hole(adj_start, adj_end, mm->scan_size, mm->scan_alignment)) { mm->scan_hit_start = hole_start; - mm->scan_hit_size = hole_end; - + mm->scan_hit_end = hole_end; return 1; } @@ -626,19 +624,10 @@ int drm_mm_scan_remove_block(struct drm_mm_node *node) node_list); prev_node->hole_follows = node->scanned_preceeds_hole; - INIT_LIST_HEAD(&node->node_list); list_add(&node->node_list, &prev_node->node_list); - /* Only need to check for containement because start&size for the - * complete resulting free block (not just the desired part) is - * stored. 
*/ - if (node->start >= mm->scan_hit_start && - node->start + node->size - <= mm->scan_hit_start + mm->scan_hit_size) { - return 1; - } - - return 0; + return (drm_mm_hole_node_end(node) > mm->scan_hit_start && + node->start < mm->scan_hit_end); } EXPORT_SYMBOL(drm_mm_scan_remove_block); diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h index 0f4a366f6fa6..3527fb3f75bb 100644 --- a/include/drm/drm_mm.h +++ b/include/drm/drm_mm.h @@ -70,7 +70,7 @@ struct drm_mm { unsigned long scan_color; unsigned long scan_size; unsigned long scan_hit_start; - unsigned scan_hit_size; + unsigned long scan_hit_end; unsigned scanned_blocks; unsigned long scan_start; unsigned long scan_end; -- cgit v1.2.3 From 54b956b903607f8f8878754dd4352da6a54a1da2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 10 Jan 2013 10:57:01 -0800 Subject: Remove __dev* markings from init.h Now that all in-kernel users of __dev* are gone, let's remove them from init.h to keep them from popping up again and again. Thanks to Bill Pemberton for doing all of the hard work to make removal of this possible. Cc: Bill Pemberton Cc: Stephen Rothwell Signed-off-by: Greg Kroah-Hartman --- include/linux/init.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include') diff --git a/include/linux/init.h b/include/linux/init.h index a799273714ac..10ed4f436458 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -93,14 +93,6 @@ #define __exit __section(.exit.text) __exitused __cold notrace -/* Used for HOTPLUG, but that is always enabled now, so just make them noops */ -#define __devinit -#define __devinitdata -#define __devinitconst -#define __devexit -#define __devexitdata -#define __devexitconst - /* Used for HOTPLUG_CPU */ #define __cpuinit __section(.cpuinit.text) __cold notrace #define __cpuinitdata __section(.cpuinit.data) @@ -337,18 +329,6 @@ void __init parse_early_options(char *cmdline); #define __INITRODATA_OR_MODULE __INITRODATA #endif /*CONFIG_MODULES*/ -/* Functions marked as __devexit may be discarded at kernel link time, depending - on config options. Newer versions of binutils detect references from - retained sections to discarded sections and flag an error. Pointers to - __devexit functions must use __devexit_p(function_name), the wrapper will - insert either the function_name or NULL, depending on the config options. - */ -#if defined(MODULE) || defined(CONFIG_HOTPLUG) -#define __devexit_p(x) x -#else -#define __devexit_p(x) NULL -#endif - #ifdef MODULE #define __exit_p(x) x #else -- cgit v1.2.3 From ba829137bfd167623363548aa385be769c6b2664 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 17 Dec 2012 09:53:33 +0100 Subject: target: Introduce TCM_NO_SENSE Introduce TCM_NO_SENSE, mapping to sense code 'Not ready, no additional sense information'. 
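For reference, the handler added below fills a fixed-format (0x70) sense buffer as laid out by SPC: byte 0 carries the response code, byte 2 the sense key, and bytes 12/13 the ASC/ASCQ pair. A minimal user-space decoding sketch, illustrative only (the function name and I/O are assumptions; offsets follow the SPC fixed format, not code from this patch):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Decode SPC fixed-format "current" sense data (response code 0x70). */
static void decode_fixed_sense(const uint8_t *sense, size_t len)
{
	if (len < 14 || (sense[0] & 0x7f) != 0x70) {
		printf("not fixed-format sense data\n");
		return;
	}
	/* TCM_NO_SENSE as set up below: key=NOT_READY (0x2), asc=0, ascq=0 */
	printf("key=0x%x asc=0x%02x ascq=0x%02x\n",
	       sense[2] & 0x0f, sense[12], sense[13]);
}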
Signed-off-by: Hannes Reinecke Cc: Nicholas Bellinger Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 10 ++++++++++ include/target/target_core_base.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 978762dc81bf..104bf3b45a01 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2597,6 +2597,16 @@ transport_send_check_condition_and_sense(struct se_cmd *cmd, * SENSE KEY values from include/scsi/scsi.h */ switch (reason) { + case TCM_NO_SENSE: + /* CURRENT ERROR */ + buffer[0] = 0x70; + buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; + /* Not Ready */ + buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY; + /* NO ADDITIONAL SENSE INFORMATION */ + buffer[SPC_ASC_KEY_OFFSET] = 0; + buffer[SPC_ASCQ_KEY_OFFSET] = 0; + break; case TCM_NON_EXISTENT_LUN: /* CURRENT ERROR */ buffer[0] = 0x70; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 7cae2360221e..663e34a5383f 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -174,6 +174,7 @@ typedef unsigned __bitwise__ sense_reason_t; enum tcm_sense_reason_table { #define R(x) (__force sense_reason_t )(x) + TCM_NO_SENSE = R(0x00), TCM_NON_EXISTENT_LUN = R(0x01), TCM_UNSUPPORTED_SCSI_OPCODE = R(0x02), TCM_INCORRECT_AMOUNT_OF_DATA = R(0x03), -- cgit v1.2.3 From 47ecfcb7d01418fcbfbc75183ba5e28e98b667b2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 11 Jan 2013 09:27:01 +0000 Subject: mm: compaction: Partially revert capture of suitable high-order page Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting for POLLIN on a local TCP socket. It was easier to trigger if there was disk IO and dirty pages at the same time and he bisected it to commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available"). The intention of that patch was to improve high-order allocations under memory pressure after changes made to reclaim in 3.6 drastically hurt THP allocations but the approach was flawed. For Eric, the problem was that page->pfmemalloc was not being cleared for captured pages leading to a poor interaction with swap-over-NFS support causing the packets to be dropped. However, I identified a few more problems with the patch including the fact that it can increase contention on zone->lock in some cases which could result in async direct compaction being aborted early. In retrospect the capture patch took the wrong approach. What it should have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it was allocating for THP and avoided races that way. While the patch was showing to improve allocation success rates at the time, the benefit is marginal given the relative complexity and it should be revisited from scratch in the context of the other reclaim-related changes that have taken place since the patch was first written and tested. This patch partially reverts commit 1fb3f8ca "mm: compaction: capture a suitable high-order page immediately when it is made available". 
Reported-and-tested-by: Eric Wong Tested-by: Eric Dumazet Cc: stable@vger.kernel.org Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 +- include/linux/mm.h | 1 - mm/compaction.c | 92 +++++++--------------------------------------- mm/internal.h | 1 - mm/page_alloc.c | 35 ++++-------------- 5 files changed, 23 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 6ecb6dc2f303..cc7bddeaf553 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync, bool *contended, struct page **page); + bool sync, bool *contended); extern int compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -75,7 +75,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { return COMPACT_CONTINUE; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 63204078f72b..66e2f7c61e5c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -455,7 +455,6 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); int split_free_page(struct page *page); -int capture_free_page(struct page *page, int alloc_order, int migratetype); /* * Compound pages have a destructor function. Provide a diff --git a/mm/compaction.c b/mm/compaction.c index 6b807e466497..2c570432aa56 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { + unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - if (cc->page) { - /* Was a suitable page captured? 
*/ - if (*cc->page) + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; - } else { - unsigned int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; - /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) - return COMPACT_PARTIAL; - } } return COMPACT_CONTINUE; @@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_CONTINUE; } -static void compact_capture_page(struct compact_control *cc) -{ - unsigned long flags; - int mtype, mtype_low, mtype_high; - - if (!cc->page || *cc->page) - return; - - /* - * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP - * regardless of the migratetype of the freelist is is captured from. - * This is fine because the order for a high-order MIGRATE_MOVABLE - * allocation is typically at least a pageblock size and overall - * fragmentation is not impaired. Other allocation types must - * capture pages from their own migratelist because otherwise they - * could pollute other pageblocks like MIGRATE_MOVABLE with - * difficult to move pages and making fragmentation worse overall. - */ - if (cc->migratetype == MIGRATE_MOVABLE) { - mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; - } else { - mtype_low = cc->migratetype; - mtype_high = cc->migratetype + 1; - } - - /* Speculatively examine the free lists without zone lock */ - for (mtype = mtype_low; mtype < mtype_high; mtype++) { - int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct page *page; - struct free_area *area; - area = &(cc->zone->free_area[order]); - if (list_empty(&area->free_list[mtype])) - continue; - - /* Take the lock and attempt capture of the page */ - if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) - return; - if (!list_empty(&area->free_list[mtype])) { - page = list_entry(area->free_list[mtype].next, - struct page, lru); - if (capture_free_page(page, cc->order, mtype)) { - spin_unlock_irqrestore(&cc->zone->lock, - flags); - *cc->page = page; - return; - } - } - spin_unlock_irqrestore(&cc->zone->lock, flags); - } - } -} - static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } - - /* Capture a page now if it is a suitable size */ - compact_capture_page(cc); } out: @@ -1069,8 +1007,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended, - struct page **page) + bool sync, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t 
*nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended, page); + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, - .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -1203,7 +1138,6 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, - .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index d597f94cc205..9ba21100ebf3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -135,7 +135,6 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool contended; /* True if a lock was contended */ - struct page **page; /* Page captured of requested size */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc6cc0e913bd..ece7b8e8a5b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1384,14 +1384,8 @@ void split_page(struct page *page, unsigned int order) set_page_refcounted(page + i); } -/* - * Similar to the split_page family of functions except that the page - * required at the given order and being isolated now to prevent races - * with parallel allocators - */ -int capture_free_page(struct page *page, int alloc_order, int migratetype) +static int __isolate_free_page(struct page *page, unsigned int order) { - unsigned int order; unsigned long watermark; struct zone *zone; int mt; @@ -1399,7 +1393,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) BUG_ON(!PageBuddy(page)); zone = page_zone(page); - order = page_order(page); mt = get_pageblock_migratetype(page); if (mt != MIGRATE_ISOLATE) { @@ -1408,7 +1401,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; - __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); + __mod_zone_freepage_state(zone, -(1UL << order), mt); } /* Remove page from free list */ @@ -1416,11 +1409,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) zone->free_area[order].nr_free--; rmv_page_order(page); - if (alloc_order != order) - expand(zone, page, alloc_order, order, - &zone->free_area[order], migratetype); - - /* Set the pageblock if the captured page is at least a pageblock */ + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1431,7 +1420,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) } } - return 1UL << alloc_order; + return 1UL << order; } /* @@ -1449,10 +1438,9 @@ int split_free_page(struct page *page) unsigned int order; int nr_pages; - BUG_ON(!PageBuddy(page)); order = page_order(page); - nr_pages = capture_free_page(page, order, 0); + nr_pages = __isolate_free_page(page, order); if (!nr_pages) return 0; @@ -2136,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page = NULL; - 
if (!order) return NULL; @@ -2149,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction, &page); + contended_compaction); current->flags &= ~PF_MEMALLOC; - /* If compaction captured a page, prep and use it */ - if (page) { - prep_new_page(page, order, gfp_mask); - goto got_page; - } - if (*did_some_progress != COMPACT_SKIPPED) { + struct page *page; + /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2168,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { -got_page: preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; -- cgit v1.2.3 From 896f97ea95c1d29c0520ee0766b66b7f64cb967c Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Fri, 11 Jan 2013 14:31:36 -0800 Subject: lib: cpu_rmap: avoid flushing all workqueues In some cases, free_irq_cpu_rmap() is called while holding a lock (eg rtnl). This can lead to deadlocks, because it invokes flush_scheduled_work() which ends up waiting for whole system workqueue to flush, but some pending works might try to acquire the lock we are already holding. This commit uses reference-counting to replace irq_run_affinity_notifiers(). It also removes irq_run_affinity_notifiers() altogether. [akpm@linux-foundation.org: eliminate free_cpu_rmap, rename cpu_rmap_reclaim() to cpu_rmap_release(), propagate kref_put() retval from cpu_rmap_put()] Signed-off-by: David Decotigny Reviewed-by: Ben Hutchings Acked-by: Eric Dumazet Reviewed-by: Josh Triplett Cc: "David S. 
Miller" Cc: Or Gerlitz Acked-by: Amir Vadai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu_rmap.h | 13 ++++-------- include/linux/interrupt.h | 5 ----- lib/cpu_rmap.c | 54 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 53 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index ac3bbb5b9502..1739510d8994 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -13,9 +13,11 @@ #include #include #include +#include /** * struct cpu_rmap - CPU affinity reverse-map + * @refcount: kref for object * @size: Number of objects to be reverse-mapped * @used: Number of objects added * @obj: Pointer to array of object pointers @@ -23,6 +25,7 @@ * based on affinity masks */ struct cpu_rmap { + struct kref refcount; u16 size, used; void **obj; struct { @@ -33,15 +36,7 @@ struct cpu_rmap { #define CPU_RMAP_DIST_INF 0xffff extern struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags); - -/** - * free_cpu_rmap - free CPU affinity reverse-map - * @rmap: Reverse-map allocated with alloc_cpu_rmap(), or %NULL - */ -static inline void free_cpu_rmap(struct cpu_rmap *rmap) -{ - kfree(rmap); -} +extern int cpu_rmap_put(struct cpu_rmap *rmap); extern int cpu_rmap_add(struct cpu_rmap *rmap, void *obj); extern int cpu_rmap_update(struct cpu_rmap *rmap, u16 index, diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5e4e6170f43a..5fa5afeeb759 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -268,11 +268,6 @@ struct irq_affinity_notify { extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); -static inline void irq_run_affinity_notifiers(void) -{ - flush_scheduled_work(); -} - #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c index 145dec5267c9..5fbed5caba6e 100644 --- a/lib/cpu_rmap.c +++ b/lib/cpu_rmap.c @@ -45,6 +45,7 @@ struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags) if (!rmap) return NULL; + kref_init(&rmap->refcount); rmap->obj = (void **)((char *)rmap + obj_offset); /* Initially assign CPUs to objects on a rota, since we have @@ -63,6 +64,35 @@ struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags) } EXPORT_SYMBOL(alloc_cpu_rmap); +/** + * cpu_rmap_release - internal reclaiming helper called from kref_put + * @ref: kref to struct cpu_rmap + */ +static void cpu_rmap_release(struct kref *ref) +{ + struct cpu_rmap *rmap = container_of(ref, struct cpu_rmap, refcount); + kfree(rmap); +} + +/** + * cpu_rmap_get - internal helper to get new ref on a cpu_rmap + * @rmap: reverse-map allocated with alloc_cpu_rmap() + */ +static inline void cpu_rmap_get(struct cpu_rmap *rmap) +{ + kref_get(&rmap->refcount); +} + +/** + * cpu_rmap_put - release ref on a cpu_rmap + * @rmap: reverse-map allocated with alloc_cpu_rmap() + */ +int cpu_rmap_put(struct cpu_rmap *rmap) +{ + return kref_put(&rmap->refcount, cpu_rmap_release); +} +EXPORT_SYMBOL(cpu_rmap_put); + /* Reevaluate nearest object for given CPU, comparing with the given * neighbours at the given distance. */ @@ -197,8 +227,7 @@ struct irq_glue { * free_irq_cpu_rmap - free a CPU affinity reverse-map used for IRQs * @rmap: Reverse-map allocated with alloc_irq_cpu_map(), or %NULL * - * Must be called in process context, before freeing the IRQs, and - * without holding any locks required by global workqueue items. 
+ * Must be called in process context, before freeing the IRQs. */ void free_irq_cpu_rmap(struct cpu_rmap *rmap) { @@ -212,12 +241,18 @@ void free_irq_cpu_rmap(struct cpu_rmap *rmap) glue = rmap->obj[index]; irq_set_affinity_notifier(glue->notify.irq, NULL); } - irq_run_affinity_notifiers(); - kfree(rmap); + cpu_rmap_put(rmap); } EXPORT_SYMBOL(free_irq_cpu_rmap); +/** + * irq_cpu_rmap_notify - callback for IRQ subsystem when IRQ affinity updated + * @notify: struct irq_affinity_notify passed by irq/manage.c + * @mask: cpu mask for new SMP affinity + * + * This is executed in workqueue context. + */ static void irq_cpu_rmap_notify(struct irq_affinity_notify *notify, const cpumask_t *mask) { @@ -230,10 +265,16 @@ irq_cpu_rmap_notify(struct irq_affinity_notify *notify, const cpumask_t *mask) pr_warning("irq_cpu_rmap_notify: update failed: %d\n", rc); } +/** + * irq_cpu_rmap_release - reclaiming callback for IRQ subsystem + * @ref: kref to struct irq_affinity_notify passed by irq/manage.c + */ static void irq_cpu_rmap_release(struct kref *ref) { struct irq_glue *glue = container_of(ref, struct irq_glue, notify.kref); + + cpu_rmap_put(glue->rmap); kfree(glue); } @@ -258,10 +299,13 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq) glue->notify.notify = irq_cpu_rmap_notify; glue->notify.release = irq_cpu_rmap_release; glue->rmap = rmap; + cpu_rmap_get(rmap); glue->index = cpu_rmap_add(rmap, glue); rc = irq_set_affinity_notifier(irq, &glue->notify); - if (rc) + if (rc) { + cpu_rmap_put(glue->rmap); kfree(glue); + } return rc; } EXPORT_SYMBOL(irq_cpu_rmap_add); -- cgit v1.2.3 From 1b963c81b14509e330e0fe3218b645ece2738dc5 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 11 Jan 2013 14:31:56 -0800 Subject: lockdep, rwsem: provide down_write_nest_lock() down_write_nest_lock() provides a means to annotate locking scenario where an outer lock is guaranteed to serialize the order nested locks are being acquired. This is analogoue to already existing mutex_lock_nest_lock() and spin_lock_nest_lock(). 
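A brief usage sketch of the new annotation (all identifiers below are illustrative, not taken from this patch): when an outer lock is known to serialize every code path that write-locks the nested rwsems, the nesting is annotated against that outer lock so lockdep does not flag the back-to-back down_write() calls.

#include <linux/mutex.h>
#include <linux/rwsem.h>

static DEFINE_MUTEX(outer_lock);	/* serializes all writers below */
static DECLARE_RWSEM(inner_a);
static DECLARE_RWSEM(inner_b);

static void update_both(void)
{
	mutex_lock(&outer_lock);
	/* Tell lockdep the outer mutex orders these nested write locks. */
	down_write_nest_lock(&inner_a, &outer_lock);
	down_write_nest_lock(&inner_b, &outer_lock);

	/* ... modify state protected by both semaphores ... */

	up_write(&inner_b);
	up_write(&inner_a);
	mutex_unlock(&outer_lock);
}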
Signed-off-by: Jiri Kosina Cc: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Mel Gorman Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 3 +++ include/linux/rwsem.h | 9 +++++++++ kernel/rwsem.c | 10 ++++++++++ 3 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 00e46376e28f..2bca44b0893c 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -524,14 +524,17 @@ static inline void print_irqtrace_events(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCK_ALLOC # ifdef CONFIG_PROVE_LOCKING # define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i) +# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) # define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, NULL, i) # else # define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i) +# define rwsem_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) # define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, NULL, i) # endif # define rwsem_release(l, n, i) lock_release(l, n, i) #else # define rwsem_acquire(l, s, t, i) do { } while (0) +# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0) # define rwsem_acquire_read(l, s, t, i) do { } while (0) # define rwsem_release(l, n, i) do { } while (0) #endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 54bd7cd7ecbd..413cc11e414a 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -125,8 +125,17 @@ extern void downgrade_write(struct rw_semaphore *sem); */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); + +# define down_write_nest_lock(sem, nest_lock) \ +do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ +} while (0); + #else # define down_read_nested(sem, subclass) down_read(sem) +# define down_write_nest_lock(sem, nest_lock) down_read(sem) # define down_write_nested(sem, subclass) down_write(sem) #endif diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 6850f53e02d8..b3c6c3fcd847 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); +void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) +{ + might_sleep(); + rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(_down_write_nest_lock); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); -- cgit v1.2.3 From 7b9205bd775afc4439ed86d617f9042ee9e76a71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Jan 2013 14:32:05 -0800 Subject: audit: create explicit AUDIT_SECCOMP event type The seccomp path was using AUDIT_ANOM_ABEND from when seccomp mode 1 could only kill a process. While we still want to make sure an audit record is forced on a kill, this should use a separate record type since seccomp mode 2 introduces other behaviors. In the case of "handled" behaviors (process wasn't killed), only emit a record if the process is under inspection. 
This change also fixes userspace examination of seccomp audit events, since it was considered malformed due to missing fields of the AUDIT_ANOM_ABEND event type. Signed-off-by: Kees Cook Cc: Al Viro Cc: Eric Paris Cc: Jeff Layton Cc: "Eric W. Biederman" Cc: Julien Tinnes Acked-by: Will Drewry Acked-by: Steve Grubb Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/audit.h | 3 ++- include/uapi/linux/audit.h | 1 + kernel/auditsc.c | 14 +++++++++++--- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index bce729afbcf9..9d5104d7aba9 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -157,7 +157,8 @@ void audit_core_dumps(long signr); static inline void audit_seccomp(unsigned long syscall, long signr, int code) { - if (unlikely(!audit_dummy_context())) + /* Force a record to be reported if a signal was delivered. */ + if (signr || unlikely(!audit_dummy_context())) __audit_seccomp(syscall, signr, code); } diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 76352ac45f24..09a2d94ab113 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -106,6 +106,7 @@ #define AUDIT_MMAP 1323 /* Record showing descriptor and flags in mmap */ #define AUDIT_NETFILTER_PKT 1324 /* Packets traversing netfilter chains */ #define AUDIT_NETFILTER_CFG 1325 /* Netfilter chain modifications */ +#define AUDIT_SECCOMP 1326 /* Secure Computing event */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e37e6a12c5e3..3e46d1dec613 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } -static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; @@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); +} + +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + audit_log_task(ab); audit_log_format(ab, " reason="); audit_log_string(ab, reason); audit_log_format(ab, " sig=%ld", signr); @@ -2723,8 +2728,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", signr); + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); + if (unlikely(!ab)) + return; + audit_log_task(ab); + audit_log_format(ab, " sig=%ld", signr); audit_log_format(ab, " syscall=%ld", syscall); audit_log_format(ab, " compat=%d", is_compat_task()); audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); -- cgit v1.2.3 From c0a3a20b6c4b5229ef5d26fd9b1c4b1957632aa7 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 11 Jan 2013 14:32:13 -0800 Subject: linux/audit.h: move ptrace.h include to kernel header While the kernel internals want pt_regs (and so it includes linux/ptrace.h), the user version of audit.h does not need it. So move the include out of the uapi version. This avoids issues where people want the audit defines and userland ptrace api. 
Including both the kernel ptrace and the userland ptrace headers can easily lead to failure. Signed-off-by: Mike Frysinger Cc: Eric Paris Cc: Al Viro Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/audit.h | 1 + include/uapi/linux/audit.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index 9d5104d7aba9..5a6d718adf34 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -24,6 +24,7 @@ #define _LINUX_AUDIT_H_ #include +#include #include struct audit_sig_info { diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 09a2d94ab113..9f096f1c0907 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -26,7 +26,6 @@ #include #include -#include /* The netlink messages for the audit system is divided into blocks: * 1000 - 1099 are for commanding the audit system -- cgit v1.2.3 From 8fb74b9fb2b182d54beee592350d9ea1f325917a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 11 Jan 2013 14:32:16 -0800 Subject: mm: compaction: partially revert capture of suitable high-order page Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting for POLLIN on a local TCP socket. It was easier to trigger if there was disk IO and dirty pages at the same time and he bisected it to commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available"). The intention of that patch was to improve high-order allocations under memory pressure after changes made to reclaim in 3.6 drastically hurt THP allocations but the approach was flawed. For Eric, the problem was that page->pfmemalloc was not being cleared for captured pages leading to a poor interaction with swap-over-NFS support causing the packets to be dropped. However, I identified a few more problems with the patch including the fact that it can increase contention on zone->lock in some cases which could result in async direct compaction being aborted early. In retrospect the capture patch took the wrong approach. What it should have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it was allocating for THP and avoided races that way. While the patch was showing to improve allocation success rates at the time, the benefit is marginal given the relative complexity and it should be revisited from scratch in the context of the other reclaim-related changes that have taken place since the patch was first written and tested. This patch partially reverts commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page immediately when it is made available"). 
Reported-and-tested-by: Eric Wong Tested-by: Eric Dumazet Cc: Signed-off-by: Mel Gorman Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 +- include/linux/mm.h | 1 - mm/compaction.c | 92 +++++++--------------------------------------- mm/internal.h | 1 - mm/page_alloc.c | 35 ++++-------------- 5 files changed, 23 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 6ecb6dc2f303..cc7bddeaf553 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync, bool *contended, struct page **page); + bool sync, bool *contended); extern int compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -75,7 +75,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { return COMPACT_CONTINUE; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 63204078f72b..66e2f7c61e5c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -455,7 +455,6 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); int split_free_page(struct page *page); -int capture_free_page(struct page *page, int alloc_order, int migratetype); /* * Compound pages have a destructor function. Provide a diff --git a/mm/compaction.c b/mm/compaction.c index f8f5c111b7d7..c62bd063d766 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, static int compact_finished(struct zone *zone, struct compact_control *cc) { + unsigned int order; unsigned long watermark; if (fatal_signal_pending(current)) @@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ - if (cc->page) { - /* Was a suitable page captured? 
*/ - if (*cc->page) + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (cc->order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; - } else { - unsigned int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; - /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[cc->migratetype])) - return COMPACT_PARTIAL; - - /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) - return COMPACT_PARTIAL; - } } return COMPACT_CONTINUE; @@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) return COMPACT_CONTINUE; } -static void compact_capture_page(struct compact_control *cc) -{ - unsigned long flags; - int mtype, mtype_low, mtype_high; - - if (!cc->page || *cc->page) - return; - - /* - * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP - * regardless of the migratetype of the freelist is is captured from. - * This is fine because the order for a high-order MIGRATE_MOVABLE - * allocation is typically at least a pageblock size and overall - * fragmentation is not impaired. Other allocation types must - * capture pages from their own migratelist because otherwise they - * could pollute other pageblocks like MIGRATE_MOVABLE with - * difficult to move pages and making fragmentation worse overall. - */ - if (cc->migratetype == MIGRATE_MOVABLE) { - mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; - } else { - mtype_low = cc->migratetype; - mtype_high = cc->migratetype + 1; - } - - /* Speculatively examine the free lists without zone lock */ - for (mtype = mtype_low; mtype < mtype_high; mtype++) { - int order; - for (order = cc->order; order < MAX_ORDER; order++) { - struct page *page; - struct free_area *area; - area = &(cc->zone->free_area[order]); - if (list_empty(&area->free_list[mtype])) - continue; - - /* Take the lock and attempt capture of the page */ - if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) - return; - if (!list_empty(&area->free_list[mtype])) { - page = list_entry(area->free_list[mtype].next, - struct page, lru); - if (capture_free_page(page, cc->order, mtype)) { - spin_unlock_irqrestore(&cc->zone->lock, - flags); - *cc->page = page; - return; - } - } - spin_unlock_irqrestore(&cc->zone->lock, flags); - } - } -} - static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } - - /* Capture a page now if it is a suitable size */ - compact_capture_page(cc); } out: @@ -1069,8 +1007,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, bool *contended, - struct page **page) + bool sync, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .page = page, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t 
*nodemask, - bool sync, bool *contended, struct page **page) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, int status; status = compact_zone_order(zone, order, gfp_mask, sync, - contended, page); + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order) struct compact_control cc = { .order = order, .sync = false, - .page = NULL, }; return __compact_pgdat(pgdat, &cc); @@ -1203,7 +1138,6 @@ static int compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, - .page = NULL, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index d597f94cc205..9ba21100ebf3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -135,7 +135,6 @@ struct compact_control { int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; bool contended; /* True if a lock was contended */ - struct page **page; /* Page captured of requested size */ }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c957805a7f0e..df2022ff0c8a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1384,14 +1384,8 @@ void split_page(struct page *page, unsigned int order) set_page_refcounted(page + i); } -/* - * Similar to the split_page family of functions except that the page - * required at the given order and being isolated now to prevent races - * with parallel allocators - */ -int capture_free_page(struct page *page, int alloc_order, int migratetype) +static int __isolate_free_page(struct page *page, unsigned int order) { - unsigned int order; unsigned long watermark; struct zone *zone; int mt; @@ -1399,7 +1393,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) BUG_ON(!PageBuddy(page)); zone = page_zone(page); - order = page_order(page); mt = get_pageblock_migratetype(page); if (mt != MIGRATE_ISOLATE) { @@ -1408,7 +1401,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; - __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); + __mod_zone_freepage_state(zone, -(1UL << order), mt); } /* Remove page from free list */ @@ -1416,11 +1409,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) zone->free_area[order].nr_free--; rmv_page_order(page); - if (alloc_order != order) - expand(zone, page, alloc_order, order, - &zone->free_area[order], migratetype); - - /* Set the pageblock if the captured page is at least a pageblock */ + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -1431,7 +1420,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) } } - return 1UL << alloc_order; + return 1UL << order; } /* @@ -1449,10 +1438,9 @@ int split_free_page(struct page *page) unsigned int order; int nr_pages; - BUG_ON(!PageBuddy(page)); order = page_order(page); - nr_pages = capture_free_page(page, order, 0); + nr_pages = __isolate_free_page(page, order); if (!nr_pages) return 0; @@ -2136,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page = NULL; - 
if (!order) return NULL; @@ -2149,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration, - contended_compaction, &page); + contended_compaction); current->flags &= ~PF_MEMALLOC; - /* If compaction captured a page, prep and use it */ - if (page) { - prep_new_page(page, order, gfp_mask); - goto got_page; - } - if (*did_some_progress != COMPACT_SKIPPED) { + struct page *page; + /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); put_cpu(); @@ -2168,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, alloc_flags & ~ALLOC_NO_WATERMARKS, preferred_zone, migratetype); if (page) { -got_page: preferred_zone->compact_blockskip_flush = false; preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; -- cgit v1.2.3 From 3cb7a56344ca45ee56d71c5f8fe9f922306bff1f Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Fri, 11 Jan 2013 14:32:20 -0800 Subject: lib/rbtree.c: avoid the use of non-static __always_inline lib/rbtree.c declared __rb_erase_color() as __always_inline void, and then exported it with EXPORT_SYMBOL. This was because __rb_erase_color() must be exported for augmented rbtree users, but it must also be inlined into rb_erase() so that the dummy callback can get optimized out of that call site. (Actually with a modern compiler, none of the dummy callback functions should even be generated as separate text functions). The above usage is legal C, but it was unusual enough for some compilers to warn about it. This change makes things more explicit, with a static __always_inline ____rb_erase_color function for use in rb_erase(), and a separate non-inline __rb_erase_color function for use in rb_erase_augmented call sites. 
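The underlying pattern generalizes beyond rbtree: keep a static __always_inline core that takes the callback, and export a thin out-of-line wrapper for external callers. A generic sketch under assumed names (not the rbtree code itself):

#include <linux/compiler.h>
#include <linux/export.h>

/*
 * Static __always_inline core: every caller gets its own inlined copy,
 * so a callback that is a visible empty function can be resolved and
 * dropped at compile time.
 */
static __always_inline void __frob(int *v, void (*cb)(int *))
{
	cb(v);		/* eliminated when cb is a known empty function */
	*v += 1;
}

/* Exported out-of-line wrapper for users that supply a real callback. */
void frob(int *v, void (*cb)(int *))
{
	__frob(v, cb);
}
EXPORT_SYMBOL(frob);

static void dummy_cb(int *v) { }

/* Internal fast path: the inlined copy lets dummy_cb be optimized away. */
void frob_fast(int *v)
{
	__frob(v, dummy_cb);
}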
Signed-off-by: Michel Lespinasse Reported-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 14 +++++++++++--- lib/rbtree.c | 20 +++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 2ac60c9cf644..fea49b5da12a 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -123,9 +123,9 @@ __rb_change_child(struct rb_node *old, struct rb_node *new, extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); -static __always_inline void -rb_erase_augmented(struct rb_node *node, struct rb_root *root, - const struct rb_augment_callbacks *augment) +static __always_inline struct rb_node * +__rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right, *tmp = node->rb_left; struct rb_node *parent, *rebalance; @@ -217,6 +217,14 @@ rb_erase_augmented(struct rb_node *node, struct rb_root *root, } augment->propagate(tmp, NULL); + return rebalance; +} + +static __always_inline void +rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } diff --git a/lib/rbtree.c b/lib/rbtree.c index 4f56a11d67fa..c0e31fe2fabf 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -194,8 +194,12 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } } -__always_inline void -__rb_erase_color(struct rb_node *parent, struct rb_root *root, +/* + * Inline version for rb_erase() use - we want to be able to inline + * and eliminate the dummy_rotate callback there + */ +static __always_inline void +____rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; @@ -355,6 +359,13 @@ __rb_erase_color(struct rb_node *parent, struct rb_root *root, } } } + +/* Non-inline version for rb_erase_augmented() use */ +void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) +{ + ____rb_erase_color(parent, root, augment_rotate); +} EXPORT_SYMBOL(__rb_erase_color); /* @@ -380,7 +391,10 @@ EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { - rb_erase_augmented(node, root, &dummy_callbacks); + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, root, dummy_rotate); } EXPORT_SYMBOL(rb_erase); -- cgit v1.2.3 From d07d7507bfb4e23735c9b83e397c43e1e8a173e8 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 10 Jan 2013 23:19:10 +0000 Subject: net, wireless: overwrite default_ethtool_ops Since: commit 2c60db037034d27f8c636403355d52872da92f81 Author: Eric Dumazet Date: Sun Sep 16 09:17:26 2012 +0000 net: provide a default dev->ethtool_ops wireless core does not correctly assign ethtool_ops. After alloc_netdev*() call, some cfg80211 drivers provide they own ethtool_ops, but some do not. 
For them, the wireless core provides the generic cfg80211_ethtool_ops, which is assigned in the NETDEV_REGISTER notifier call: if (!dev->ethtool_ops) dev->ethtool_ops = &cfg80211_ethtool_ops; But after Eric's commit, dev->ethtool_ops is no longer NULL (on cfg80211 drivers without custom ethtool_ops), but points to &default_ethtool_ops. To fix the problem, provide a function that overwrites default_ethtool_ops and have the wireless core use it. Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Acked-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 8 ++++++++ net/wireless/core.c | 3 +-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c599e4782d45..9ef07d0868b6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -60,6 +60,9 @@ struct wireless_dev; #define SET_ETHTOOL_OPS(netdev,ops) \ ( (netdev)->ethtool_ops = (ops) ) +extern void netdev_set_default_ethtool_ops(struct net_device *dev, + const struct ethtool_ops *ops); + /* hardware address assignment types */ #define NET_ADDR_PERM 0 /* address is permanent (default) */ #define NET_ADDR_RANDOM 1 /* address is generated randomly */ diff --git a/net/core/dev.c b/net/core/dev.c index 515473ee52cb..f64e439b4a00 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6121,6 +6121,14 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) static const struct ethtool_ops default_ethtool_ops; +void netdev_set_default_ethtool_ops(struct net_device *dev, + const struct ethtool_ops *ops) +{ + if (dev->ethtool_ops == &default_ethtool_ops) + dev->ethtool_ops = ops; +} +EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); + /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for diff --git a/net/wireless/core.c b/net/wireless/core.c index 14d990400354..b677eab55b68 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -866,8 +866,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; - if (!dev->ethtool_ops) - dev->ethtool_ops = &cfg80211_ethtool_ops; + netdev_set_default_ethtool_ops(dev, &cfg80211_ethtool_ops); if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || -- cgit v1.2.3 From 8aef33a7cf40ca9da188e8578b2abe7267a38c52 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 15 Jan 2013 14:18:04 +0100 Subject: cpuidle: remove the power_specified field in the driver We realized that the power usage field is never filled, and when it is filled (for tegra), the power_specified flag is not set, causing all of these values to be reset when the driver is initialized with set_power_state(). However, the power_specified flag can simply be removed under the assumption that the states are always backward sorted, which is the case with the current code. This change allows the menu governor's select function and cpuidle_play_dead() to be simplified. Moreover, the set_power_states() function can be removed as it no longer makes sense. Drop the power_specified flag from struct cpuidle_driver and make the related changes described above. As a consequence, this also fixes the bug where, on systems with dynamic C-states, the power fields are not initialized.
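As an aside, a small self-contained C sketch of what the backward-sorted assumption buys: when the states array is ordered by decreasing power consumption, the deepest acceptable state is simply the highest-indexed usable one, so no per-state power bookkeeping is needed. The struct and field names below are invented for the example and are not the cpuidle API.

/* Illustrative sketch only; not the cpuidle driver API. */
#include <stddef.h>

struct idle_state {
	int exit_latency_us;	/* worst-case wakeup latency */
	int usable;		/* e.g. the state has an enter handler */
};

/*
 * With states[] ordered by decreasing power consumption, the deepest
 * acceptable state is the last usable one within the latency budget,
 * so scan from the back instead of tracking a minimum power value.
 */
static int pick_deepest(const struct idle_state *states, size_t count,
			int latency_req_us)
{
	size_t i;

	for (i = count; i-- > 0; ) {
		if (states[i].usable &&
		    states[i].exit_latency_us <= latency_req_us)
			return (int)i;
	}
	return -1;	/* no state satisfies the constraints */
}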
[rjw: Changelog] References: https://bugzilla.kernel.org/show_bug.cgi?id=42870 References: https://bugzilla.kernel.org/show_bug.cgi?id=43349 References: https://lkml.org/lkml/2012/10/16/518 Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 17 ++++------------- drivers/cpuidle/driver.c | 25 ------------------------- drivers/cpuidle/governors/menu.c | 8 ++------ include/linux/cpuidle.h | 2 +- 4 files changed, 7 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index fb4a7dd57f94..e1f6860e069c 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -69,24 +69,15 @@ int cpuidle_play_dead(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); - int i, dead_state = -1; - int power_usage = INT_MAX; + int i; if (!drv) return -ENODEV; /* Find lowest-power state that supports long-term idle */ - for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { - struct cpuidle_state *s = &drv->states[i]; - - if (s->power_usage < power_usage && s->enter_dead) { - power_usage = s->power_usage; - dead_state = i; - } - } - - if (dead_state != -1) - return drv->states[dead_state].enter_dead(dev, dead_state); + for (i = drv->state_count - 1; i >= CPUIDLE_DRIVER_STATE_START; i--) + if (drv->states[i].enter_dead) + return drv->states[i].enter_dead(dev, i); return -ENODEV; } diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index c2b281afe0ed..422c7b69ba7c 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -19,34 +19,9 @@ DEFINE_SPINLOCK(cpuidle_driver_lock); static void __cpuidle_set_cpu_driver(struct cpuidle_driver *drv, int cpu); static struct cpuidle_driver * __cpuidle_get_cpu_driver(int cpu); -static void set_power_states(struct cpuidle_driver *drv) -{ - int i; - - /* - * cpuidle driver should set the drv->power_specified bit - * before registering if the driver provides - * power_usage numbers. - * - * If power_specified is not set, - * we fill in power_usage with decreasing values as the - * cpuidle code has an implicit assumption that state Cn - * uses less power than C(n-1). - * - * With CONFIG_ARCH_HAS_CPU_RELAX, C0 is already assigned - * an power value of -1. So we use -2, -3, etc, for other - * c-states. 
- */ for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) - drv->states[i].power_usage = -1 - i; -} - static void __cpuidle_driver_init(struct cpuidle_driver *drv) { drv->refcnt = 0; - - if (!drv->power_specified) - set_power_states(drv); } static int __cpuidle_register_driver(struct cpuidle_driver *drv, int cpu) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 20ea33afdda1..fe343a06b7da 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -312,7 +312,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct menu_device *data = &__get_cpu_var(menu_devices); int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); - int power_usage = INT_MAX; int i; int multiplier; struct timespec t; @@ -383,11 +382,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (s->exit_latency * multiplier > data->predicted_us) continue; - if (s->power_usage < power_usage) { - power_usage = s->power_usage; - data->last_state_idx = i; - data->exit_us = s->exit_latency; - } + data->last_state_idx = i; + data->exit_us = s->exit_latency; } /* not deepest C-state chosen for low predicted residency */ diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 3711b34dc4f9..24cd1037b6d6 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -126,9 +126,9 @@ struct cpuidle_driver { struct module *owner; int refcnt; - unsigned int power_specified:1; /* set to 1 to use the core cpuidle time keeping (for all states). */ unsigned int en_core_tk_irqen:1; + /* states array must be ordered in decreasing power consumption */ struct cpuidle_state states[CPUIDLE_STATE_MAX]; int state_count; int safe_state_index; -- cgit v1.2.3
From 774a1221e862b343388347bac9b318767336b20b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 Jan 2013 18:52:51 -0800 Subject: module, async: async_synchronize_full() on module init iff async is used If the default iosched is built as a module, the kernel may deadlock while trying to load the iosched module on device probe if the probing was running off async. This is because async_synchronize_full() at the end of module init ends up waiting for the async job which initiated the module loading.

  async A                              modprobe
  1. finds a device
  2. registers the block device
  3. request_module(default iosched)
                                       4. modprobe in userland
                                       5. load and init module
                                       6. async_synchronize_full()

Async A waits for modprobe to finish in request_module() and modprobe waits for async A to finish in async_synchronize_full(). Because there's no easy way to track the dependency once control goes out to userland, implementing properly nested flushing is difficult. For now, make module init perform async_synchronize_full() iff module init has queued async jobs, as suggested by Linus. This avoids the described deadlock because the iosched module doesn't use async and thus wouldn't invoke async_synchronize_full(). This is hacky and incomplete. It will deadlock if async module loading nests; however, this works around the known problem case and seems to be the best of bad options. For more details, please refer to the following thread.
http://thread.gmane.org/gmane.linux.kernel/1420814 Signed-off-by: Tejun Heo Reported-by: Alex Riesen Tested-by: Ming Lei Tested-by: Alex Riesen Cc: Arjan van de Ven Cc: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/async.c | 3 +++ kernel/module.c | 27 +++++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 206bb089c06b..6fc8f45de4e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1810,6 +1810,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ diff --git a/kernel/async.c b/kernel/async.c index 9d3118384858..a1d585c351d6 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -196,6 +196,9 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); + /* mark that this task has queued an async job, used by module init */ + current->flags |= PF_USED_ASYNC; + /* schedule for execution */ queue_work(system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 250092c1d57d..b10b048367e1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3013,6 +3013,12 @@ static int do_init_module(struct module *mod) { int ret = 0; + /* + * We want to find out whether @mod uses async during init. Clear + * PF_USED_ASYNC. async_schedule*() will set it. + */ + current->flags &= ~PF_USED_ASYNC; + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -3058,8 +3064,25 @@ static int do_init_module(struct module *mod) blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + /* + * We need to finish all async code before the module init sequence + * is done. This has potential to deadlock. For example, a newly + * detected block device can trigger request_module() of the + * default iosched from async probing task. Once userland helper + * reaches here, async_synchronize_full() will wait on the async + * task waiting on request_module() and deadlock. + * + * This deadlock is avoided by perfomring async_synchronize_full() + * iff module init queued any async jobs. This isn't a full + * solution as it will deadlock the same if module loading from + * async jobs nests more than once; however, due to the various + * constraints, this hack seems to be the best option for now. + * Please refer to the following thread for details. + * + * http://thread.gmane.org/gmane.linux.kernel/1420814 + */ + if (current->flags & PF_USED_ASYNC) + async_synchronize_full(); mutex_lock(&module_mutex); /* Drop initial reference. 
*/ -- cgit v1.2.3 From e65b9ad222c280c031bc8d3642cc38dd3026fe06 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 15 Jan 2013 20:12:37 +0100 Subject: lockdep, rwsem: fix down_write_nest_lock() if !CONFIG_DEBUG_LOCK_ALLOC Commit 1b963c81b145 ("lockdep, rwsem: provide down_write_nest_lock()") contains a bug in a codepath when CONFIG_DEBUG_LOCK_ALLOC is disabled, which causes down_read() to be called instead of down_write() by mistake on such configurations. Fix that. Reported-and-tested-by: Andrew Clayton Reported-and-tested-by: Zlatko Calusic Signed-off-by: Jiri Kosina Reviewed-by: Rik van Riel Signed-off-by: Linus Torvalds --- include/linux/rwsem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 413cc11e414a..8da67d625e13 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -135,7 +135,7 @@ do { \ #else # define down_read_nested(sem, subclass) down_read(sem) -# define down_write_nest_lock(sem, nest_lock) down_read(sem) +# define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) #endif -- cgit v1.2.3
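The pitfall here generalizes to any config-dependent locking wrapper: the fallback for the non-debug configuration must expand to the same underlying primitive and drop only the annotation. Below is a simplified sketch of the shape involved, with stand-in declarations rather than the actual <linux/rwsem.h> contents.

/* Simplified sketch; declarations stand in for the real kernel headers. */
struct rw_semaphore;
struct lockdep_map;

void down_read(struct rw_semaphore *sem);
void down_write(struct rw_semaphore *sem);
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *map);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* Debug build: take the write lock with an extra lockdep nesting annotation. */
# define down_write_nest_lock(sem, nest_lock) \
	_down_write_nest_lock(sem, &(nest_lock)->dep_map)
#else
/*
 * Non-debug fallback: must still take the WRITE lock, only the annotation
 * is dropped.  The bug fixed above expanded to down_read(sem) here, so
 * callers silently took a shared lock and lost mutual exclusion on
 * !CONFIG_DEBUG_LOCK_ALLOC kernels.
 */
# define down_write_nest_lock(sem, nest_lock) down_write(sem)
#endif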