From d0164adc89f6bb374d304ffcc375c6d2652fe67d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:21 -0800 Subject: mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd __GFP_WAIT has been used to identify atomic context in callers that hold spinlocks or are in interrupts. They are expected to be high priority and have access one of two watermarks lower than "min" which can be referred to as the "atomic reserve". __GFP_HIGH users get access to the first lower watermark and can be called the "high priority reserve". Over time, callers had a requirement to not block when fallback options were available. Some have abused __GFP_WAIT leading to a situation where an optimisitic allocation with a fallback option can access atomic reserves. This patch uses __GFP_ATOMIC to identify callers that are truely atomic, cannot sleep and have no alternative. High priority users continue to use __GFP_HIGH. __GFP_DIRECT_RECLAIM identifies callers that can sleep and are willing to enter direct reclaim. __GFP_KSWAPD_RECLAIM to identify callers that want to wake kswapd for background reclaim. __GFP_WAIT is redefined as a caller that is willing to enter direct reclaim and wake kswapd for background reclaim. This patch then converts a number of sites o __GFP_ATOMIC is used by callers that are high priority and have memory pools for those requests. GFP_ATOMIC uses this flag. o Callers that have a limited mempool to guarantee forward progress clear __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall into this category where kswapd will still be woken but atomic reserves are not used as there is a one-entry mempool to guarantee progress. o Callers that are checking if they are non-blocking should use the helper gfpflags_allow_blocking() where possible. This is because checking for __GFP_WAIT as was done historically now can trigger false positives. Some exceptions like dm-crypt.c exist where the code intent is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to flag manipulations. o Callers that built their own GFP flags instead of starting with GFP_KERNEL and friends now also need to specify __GFP_KSWAPD_RECLAIM. The first key hazard to watch out for is callers that removed __GFP_WAIT and was depending on access to atomic reserves for inconspicuous reasons. In some cases it may be appropriate for them to use __GFP_HIGH. The second key hazard is callers that assembled their own combination of GFP flags instead of starting with something like GFP_KERNEL. They may now wish to specify __GFP_KSWAPD_RECLAIM. It's almost certainly harmless if it's missed in most cases as other activity will wake kswapd. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 6 +++--- kernel/cgroup.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/smp.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8a056a32ded7..5ffcbd354a52 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1371,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_DIRECT_RECLAIM) { if (audit_pid && audit_pid == current->pid) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; sleep_time = timeout_start + audit_backlog_wait_time - jiffies; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9d0cce3f9ce..f1603c153890 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -299,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4e49cc4c9952..deae3907ac1e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; /* this guy won't enter reclaim */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5235dd4e1e2f..3a970604308f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- > 0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; diff --git a/kernel/smp.c b/kernel/smp.c index 07854477c164..d903c02223af 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), cpumask_var_t cpus; int cpu, ret; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); -- cgit v1.2.3 From 71baba4b92dc1fa1bc461742c6ab1942ec6034e9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 6 Nov 2015 16:28:28 -0800 Subject: mm, page_alloc: rename __GFP_WAIT to __GFP_RECLAIM __GFP_WAIT was used to signal that the caller was in atomic context and could not sleep. Now it is possible to distinguish between true atomic context and callers that are not willing to sleep. The latter should clear __GFP_DIRECT_RECLAIM so kswapd will still wake. As clearing __GFP_WAIT behaves differently, there is a risk that people will clear the wrong flags. This patch renames __GFP_WAIT to __GFP_RECLAIM to clearly indicate what it does -- setting it allows all reclaim activity, clearing them prevents it. [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Cc: Christoph Lameter Acked-by: David Rientjes Cc: Vitaly Wool Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-core.c | 4 ++-- block/blk-mq.c | 2 +- block/scsi_ioctl.c | 6 +++--- drivers/block/drbd/drbd_bitmap.c | 2 +- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/paride/pd.c | 2 +- drivers/block/pktcdvd.c | 4 ++-- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/ide/ide-atapi.c | 2 +- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-cd_ioctl.c | 2 +- drivers/ide/ide-devsets.c | 2 +- drivers/ide/ide-disk.c | 2 +- drivers/ide/ide-ioctls.c | 4 ++-- drivers/ide/ide-park.c | 2 +- drivers/ide/ide-pm.c | 4 ++-- drivers/ide/ide-tape.c | 4 ++-- drivers/ide/ide-taskfile.c | 4 ++-- drivers/infiniband/hw/qib/qib_init.c | 2 +- drivers/misc/vmw_balloon.c | 2 +- drivers/nvme/host/pci.c | 6 ++++-- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/scsi_lib.c | 4 ++-- drivers/staging/rdma/hfi1/init.c | 2 +- drivers/staging/rdma/ipath/ipath_file_ops.c | 2 +- fs/cachefiles/internal.h | 2 +- fs/direct-io.c | 2 +- fs/nilfs2/mdt.h | 2 +- include/linux/gfp.h | 16 ++++++++-------- kernel/power/swap.c | 16 ++++++++-------- lib/percpu_ida.c | 2 +- mm/failslab.c | 8 ++++---- mm/filemap.c | 2 +- mm/huge_memory.c | 2 +- mm/memcontrol.c | 2 +- mm/migrate.c | 2 +- mm/page_alloc.c | 9 +++++---- security/integrity/ima/ima_crypto.c | 2 +- 38 files changed, 71 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 9e32f0868e36..590cca21c24a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -638,7 +638,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp) if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; - if (!(gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp)) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, @@ -2038,7 +2038,7 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) { + if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { q->make_request_fn(q, bio); diff --git a/block/blk-mq.c b/block/blk-mq.c index 68c0a3416b34..694f8703f83c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1186,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|__GFP_HIGH, false, ctx, hctx); + __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index dda653ce7b24..0774799942e0 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; @@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; } - if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { + if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) { err = DRIVER_ERROR << 24; goto error; } @@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, WRITE, __GFP_WAIT); + rq = blk_get_request(q, WRITE, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index e5e0f19ceda0..3dc53a16ed3a 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1007,7 +1007,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM); copy_highpage(page, b->bm_pages[page_nr]); bm_store_page_idx(page, page_nr); } else diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f504232c1ee7..a28a562f7b7f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -173,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; - rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true); return blk_mq_rq_to_pdu(rq); } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index b9242d78283d..562b5a4ca7b7 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -723,7 +723,7 @@ static int pd_special_command(struct pd_unit *disk, struct request *rq; int err = 0; - rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT); + rq = blk_get_request(disk->gd->queue, READ, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7be2375db7f2..5959c2981cc7 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,14 +704,14 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * int ret = 0; rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? - WRITE : READ, __GFP_WAIT); + WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, - __GFP_WAIT); + __GFP_RECLAIM); if (ret) goto out; } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d58cb9e034fe..7e505d4be7c0 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2216,7 +2216,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) mapping = file_inode(obj->base.filp)->i_mapping; gfp = mapping_gfp_mask(mapping); gfp |= __GFP_NORETRY | __GFP_NOWARN; - gfp &= ~(__GFP_IO | __GFP_WAIT); + gfp &= ~(__GFP_IO | __GFP_RECLAIM); sg = st->sgl; st->nents = 0; for (i = 0; i < page_count; i++) { diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 1362ad80a76c..05352f490d60 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -92,7 +92,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, struct request *rq; int error; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = (char *)pc; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 64a6b827b3dd..ef907fd5ba98 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -441,7 +441,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, struct request *rq; int error; - rq = blk_get_request(drive->queue, write, __GFP_WAIT); + rq = blk_get_request(drive->queue, write, __GFP_RECLAIM); memcpy(rq->cmd, cmd, BLK_MAX_CDB); rq->cmd_type = REQ_TYPE_ATA_PC; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 066e39036518..474173eb31bb 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -303,7 +303,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) struct request *rq; int ret; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_flags = REQ_QUIET; ret = blk_execute_rq(drive->queue, cd->disk, rq, 0); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index b05a74d78ef5..0dd43b4fcec6 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -165,7 +165,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, if (!(setting->flags & DS_SYNC)) return setting->set(drive, arg); - rq = blk_get_request(q, READ, __GFP_WAIT); + rq = blk_get_request(q, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 5; rq->cmd[0] = REQ_DEVSET_EXEC; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 56b9708894a5..37a8a907febe 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -477,7 +477,7 @@ static int set_multcount(ide_drive_t *drive, int arg) if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) return -EBUSY; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; drive->mult_req = arg; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index aa2e9b77b20d..d05db2469209 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -125,7 +125,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) if (NULL == (void *) arg) { struct request *rq; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; err = blk_execute_rq(drive->queue, NULL, rq, 0); blk_put_request(rq); @@ -221,7 +221,7 @@ static int generic_drive_reset(ide_drive_t *drive) struct request *rq; int ret = 0; - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 1; rq->cmd[0] = REQ_DRIVE_RESET; diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index c80868520488..2d7dca56dd24 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -31,7 +31,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) } spin_unlock_irq(&hwif->lock); - rq = blk_get_request(q, READ, __GFP_WAIT); + rq = blk_get_request(q, READ, __GFP_RECLAIM); rq->cmd[0] = REQ_PARK_HEADS; rq->cmd_len = 1; rq->cmd_type = REQ_TYPE_DRV_PRIV; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 081e43458d50..e34af488693a 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -18,7 +18,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; @@ -88,7 +88,7 @@ int generic_ide_resume(struct device *dev) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; rq->special = &rqpm; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index f5d51d1d09ee..12fa04997dcc 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -852,7 +852,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); BUG_ON(size < 0 || size % tape->blk_size); - rq = blk_get_request(drive->queue, READ, __GFP_WAIT); + rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd[13] = cmd; rq->rq_disk = tape->disk; @@ -860,7 +860,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) if (size) { ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size, - __GFP_WAIT); + __GFP_RECLAIM); if (ret) goto out_put; } diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 0979e126fff1..a716693417a3 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -430,7 +430,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, int error; int rw = !(cmd->tf_flags & IDE_TFLAG_WRITE) ? READ : WRITE; - rq = blk_get_request(drive->queue, rw, __GFP_WAIT); + rq = blk_get_request(drive->queue, rw, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; /* @@ -441,7 +441,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, */ if (nsect) { error = blk_rq_map_kern(drive->queue, rq, buf, - nsect * SECTOR_SIZE, __GFP_WAIT); + nsect * SECTOR_SIZE, __GFP_RECLAIM); if (error) goto put_req; } diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 7e00470adc30..4ff340fe904f 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1680,7 +1680,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) * heavy filesystem activity makes these fail, and we can * use compound pages. */ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; egrcnt = rcd->rcvegrcnt; egroff = rcd->rcvegr_tid_base; diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 89300870fefb..1e688bfec567 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -75,7 +75,7 @@ MODULE_LICENSE("GPL"); /* * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't - * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use + * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use * __GFP_NOWARN, to suppress page allocation failure warnings. */ #define VMW_PAGE_ALLOC_NOSLEEP (__GFP_HIGHMEM|__GFP_NOWARN) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e878590e71b6..6c195554d94a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1025,11 +1025,13 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, req->special = (void *)0; if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT); + ret = blk_rq_map_kern(q, req, buffer, bufflen, + __GFP_DIRECT_RECLAIM); if (ret) goto out; } else if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT); + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + __GFP_DIRECT_RECLAIM); if (ret) goto out; bio = req->bio; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 66a96cd98b97..984ddcb4786d 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1970,7 +1970,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) struct request *req; /* - * blk_get_request with GFP_KERNEL (__GFP_WAIT) sleeps until a + * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a * request becomes available */ req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 126a48c6431e..dd8ad2a44510 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -222,13 +222,13 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int write = (data_direction == DMA_TO_DEVICE); int ret = DRIVER_ERROR << 24; - req = blk_get_request(sdev->request_queue, write, __GFP_WAIT); + req = blk_get_request(sdev->request_queue, write, __GFP_RECLAIM); if (IS_ERR(req)) return ret; blk_rq_set_block_pc(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, - buffer, bufflen, __GFP_WAIT)) + buffer, bufflen, __GFP_RECLAIM)) goto out; req->cmd_len = COMMAND_SIZE(cmd[0]); diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index 47a1202fcbdf..8666f3ad24e9 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -1560,7 +1560,7 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) * heavy filesystem activity makes these fail, and we can * use compound pages. */ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; /* * The minimum size of the eager buffers is a groups of MTU-sized diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c index 5d9b9dbd8fc4..13c3cd11ab92 100644 --- a/drivers/staging/rdma/ipath/ipath_file_ops.c +++ b/drivers/staging/rdma/ipath/ipath_file_ops.c @@ -905,7 +905,7 @@ static int ipath_create_user_egr(struct ipath_portdata *pd) * heavy filesystem activity makes these fail, and we can * use compound pages. */ - gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; egrcnt = dd->ipath_rcvegrcnt; /* TID number offset for this port */ diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index aecd0859eacb..9c4b737a54df 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -30,7 +30,7 @@ extern unsigned cachefiles_debug; #define CACHEFILES_DEBUG_KLEAVE 2 #define CACHEFILES_DEBUG_KDEBUG 4 -#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC) +#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC) /* * node records diff --git a/fs/direct-io.c b/fs/direct-io.c index 3ae0e0427191..18e7554cf94c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -361,7 +361,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, /* * bio_alloc() is guaranteed to return a bio when called with - * __GFP_WAIT and we request a valid number of vectors. + * __GFP_RECLAIM and we request a valid number of vectors. */ bio = bio_alloc(GFP_KERNEL, nr_vecs); diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index fe529a87a208..03246cac3338 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) } /* Default GFP flags using highmem */ -#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) +#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM) int nilfs_mdt_get_block(struct inode *, unsigned long, int, void (*init_block)(struct inode *, diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 86f9f7da86ea..369227202ac2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -107,7 +107,7 @@ struct vm_area_struct; * can be cleared when the reclaiming of pages would cause unnecessary * disruption. */ -#define __GFP_WAIT ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) +#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) #define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ #define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ @@ -126,12 +126,12 @@ struct vm_area_struct; */ #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) -#define GFP_NOIO (__GFP_WAIT) -#define GFP_NOFS (__GFP_WAIT | __GFP_IO) -#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) -#define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ +#define GFP_NOIO (__GFP_RECLAIM) +#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) +#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) +#define GFP_TEMPORARY (__GFP_RECLAIM | __GFP_IO | __GFP_FS | \ __GFP_RECLAIMABLE) -#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ @@ -143,12 +143,12 @@ struct vm_area_struct; #define GFP_MOVABLE_SHIFT 3 /* Control page allocator reclaim behavior */ -#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ +#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) /* Control slab gfp mask during early boot */ -#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)) +#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b2066fb5b10f..12cd989dadf6 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; @@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); + __get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); + __GFP_RECLAIM | __GFP_HIGH : + __GFP_RECLAIM | __GFP_NOWARN | + __GFP_NORETRY); if (!page[i]) { if (i < LZO_CMP_PAGES) { diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index f75715131f20..6d40944960de 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -135,7 +135,7 @@ static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags) * TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, of course). * * @gfp indicates whether or not to wait until a free id is available (it's not - * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep + * used for internal memory allocations); thus if passed __GFP_RECLAIM we may sleep * however long it takes until another thread frees an id (same semantics as a * mempool). * diff --git a/mm/failslab.c b/mm/failslab.c index 98fb490311eb..79171b4a5826 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -3,11 +3,11 @@ static struct { struct fault_attr attr; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; bool cache_filter; } failslab = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .cache_filter = false, }; @@ -16,7 +16,7 @@ bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) if (gfpflags & __GFP_NOFAIL) return false; - if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) + if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) @@ -42,7 +42,7 @@ static int __init failslab_debugfs_init(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait)) + &failslab.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("cache-filter", mode, dir, &failslab.cache_filter)) diff --git a/mm/filemap.c b/mm/filemap.c index 58e04e26f996..6ef3674c0763 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2713,7 +2713,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * page is known to the local caching routines. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). + * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). * */ int try_to_release_page(struct page *page, gfp_t gfp_mask) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f5c08b46fef8..9812d4618651 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -786,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; } /* Caller must hold page table lock. */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 05374f09339c..a5470674a477 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2120,7 +2120,7 @@ done_restock: /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here - * if __GFP_WAIT but let's always punt for simplicity and so that + * if __GFP_RECLAIM but let's always punt for simplicity and so that * GFP_KERNEL can consistently be used during reclaim. @memcg is * not recorded as it most likely matches current's and won't * change in the meantime. As high limit is checked again before diff --git a/mm/migrate.c b/mm/migrate.c index e60379eb23f8..7890d0bb5e23 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1752,7 +1752,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, HPAGE_PMD_ORDER); if (!new_page) goto out_fail; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 70461f3e3378..1b373096b990 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2160,11 +2160,11 @@ static struct { struct fault_attr attr; bool ignore_gfp_highmem; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; @@ -2183,7 +2183,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; - if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_DIRECT_RECLAIM)) + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; return should_fail(&fail_page_alloc.attr, 1 << order); @@ -2202,7 +2203,7 @@ static int __init fail_page_alloc_debugfs(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_wait)) + &fail_page_alloc.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem)) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index e24121afb2f2..6eb62936c672 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -126,7 +126,7 @@ static void *ima_alloc_pages(loff_t max_size, size_t *allocated_size, { void *ptr; int order = ima_maxorder; - gfp_t gfp_mask = __GFP_WAIT | __GFP_NOWARN | __GFP_NORETRY; + gfp_t gfp_mask = __GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY; if (order) order = min(get_order(max_size), order); -- cgit v1.2.3 From 3d9c637f4ae74b45d95bb6cbd793fbffad0a709c Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 6 Nov 2015 16:29:12 -0800 Subject: module: export param_free_charp() Change the param_free_charp() function from static to exported. It is used by zswap in the next patch ("zswap: use charp for zswap param strings"). Signed-off-by: Dan Streetman Acked-by: Rusty Russell Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/moduleparam.h | 1 + kernel/params.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index c12f2147c350..52666d90ca94 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -386,6 +386,7 @@ extern int param_get_ullong(char *buffer, const struct kernel_param *kp); extern const struct kernel_param_ops param_ops_charp; extern int param_set_charp(const char *val, const struct kernel_param *kp); extern int param_get_charp(char *buffer, const struct kernel_param *kp); +extern void param_free_charp(void *arg); #define param_check_charp(name, p) __param_check(name, p, char *) /* We used to allow int as well as bool. We're taking that away! */ diff --git a/kernel/params.c b/kernel/params.c index b6554aa71094..93a380a2345d 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -325,10 +325,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_charp); -static void param_free_charp(void *arg) +void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } +EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, -- cgit v1.2.3 From 3824657c522f19f85a76bd932821174a5557a382 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 6 Nov 2015 16:30:38 -0800 Subject: printk: prevent userland from spoofing kernel messages The following statement of ABI/testing/dev-kmsg is not quite right: It is not possible to inject messages from userspace with the facility number LOG_KERN (0), to make sure that the origin of the messages can always be reliably determined. Userland actually can inject messages with a facility of 0 by abusing the fact that the facility is stored in a u8 data type. By using a facility which is a multiple of 256 the assignment of msg->facility in log_store() implicitly truncates it to 0, i.e. LOG_KERN, allowing users of /dev/kmsg to spoof kernel messages as shown below: The following call... # printf '<%d>Kernel panic - not syncing: beer empty\n' 0 >/dev/kmsg ...leads to the following log entry (dmesg -x | tail -n 1): user :emerg : [ 66.137758] Kernel panic - not syncing: beer empty However, this call... # printf '<%d>Kernel panic - not syncing: beer empty\n' 0x800 >/dev/kmsg ...leads to the slightly different log entry (note the kernel facility): kern :emerg : [ 74.177343] Kernel panic - not syncing: beer empty Fix that by limiting the user provided facility to 8 bit right from the beginning and catch the truncation early. Fixes: 7ff9554bb578 ("printk: convert byte-buffer to variable-length...") Signed-off-by: Mathias Krause Cc: Greg Kroah-Hartman Cc: Petr Mladek Cc: Alex Elder Cc: Joe Perches Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b16f35487b67..2ce8826f1053 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,6 +269,9 @@ static u32 clear_idx; #define PREFIX_MAX 32 #define LOG_LINE_MAX (1024 - PREFIX_MAX) +#define LOG_LEVEL(v) ((v) & 0x07) +#define LOG_FACILITY(v) ((v) >> 3 & 0xff) + /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 @@ -612,7 +615,6 @@ struct devkmsg_user { static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; - int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ size_t len = iov_iter_count(from); @@ -642,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) line = buf; if (line[0] == '<') { char *endp = NULL; + unsigned int u; - i = simple_strtoul(line+1, &endp, 10); + u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; + level = LOG_LEVEL(u); + if (LOG_FACILITY(u) != 0) + facility = LOG_FACILITY(u); endp++; len -= endp - line; line = endp; -- cgit v1.2.3 From 2e01fabe67ccaff1d59bda01e60a61f5fb0aa7b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:19 -0800 Subject: signals: kill block_all_signals() and unblock_all_signals() It is hardly possible to enumerate all problems with block_all_signals() and unblock_all_signals(). Just for example, 1. block_all_signals(SIGSTOP/etc) simply can't help if the caller is multithreaded. Another thread can dequeue the signal and force the group stop. 2. Even is the caller is single-threaded, it will "stop" anyway. It will not sleep, but it will spin in kernel space until SIGCONT or SIGKILL. And a lot more. In short, this interface doesn't work at all, at least the last 10+ years. Daniel said: Yeah the only times I played around with the DRM_LOCK stuff was when old drivers accidentally deadlocked - my impression is that the entire DRM_LOCK thing was never really tested properly ;-) Hence I'm all for purging where this leaks out of the drm subsystem. Signed-off-by: Oleg Nesterov Acked-by: Daniel Vetter Acked-by: Dave Airlie Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_lock.c | 41 ------------------------------------- include/drm/drmP.h | 1 - include/linux/sched.h | 7 +------ kernel/signal.c | 51 +--------------------------------------------- 4 files changed, 2 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/drm_lock.c b/drivers/gpu/drm/drm_lock.c index 4924d381b664..daa2ff12101b 100644 --- a/drivers/gpu/drm/drm_lock.c +++ b/drivers/gpu/drm/drm_lock.c @@ -38,8 +38,6 @@ #include "drm_legacy.h" #include "drm_internal.h" -static int drm_notifier(void *priv); - static int drm_lock_take(struct drm_lock_data *lock_data, unsigned int context); /** @@ -118,14 +116,8 @@ int drm_legacy_lock(struct drm_device *dev, void *data, * really probably not the correct answer but lets us debug xkb * xserver for now */ if (!file_priv->is_master) { - sigemptyset(&dev->sigmask); - sigaddset(&dev->sigmask, SIGSTOP); - sigaddset(&dev->sigmask, SIGTSTP); - sigaddset(&dev->sigmask, SIGTTIN); - sigaddset(&dev->sigmask, SIGTTOU); dev->sigdata.context = lock->context; dev->sigdata.lock = master->lock.hw_lock; - block_all_signals(drm_notifier, dev, &dev->sigmask); } if (dev->driver->dma_quiescent && (lock->flags & _DRM_LOCK_QUIESCENT)) @@ -169,7 +161,6 @@ int drm_legacy_unlock(struct drm_device *dev, void *data, struct drm_file *file_ /* FIXME: Should really bail out here. */ } - unblock_all_signals(); return 0; } @@ -287,38 +278,6 @@ int drm_legacy_lock_free(struct drm_lock_data *lock_data, unsigned int context) return 0; } -/** - * If we get here, it means that the process has called DRM_IOCTL_LOCK - * without calling DRM_IOCTL_UNLOCK. - * - * If the lock is not held, then let the signal proceed as usual. If the lock - * is held, then set the contended flag and keep the signal blocked. - * - * \param priv pointer to a drm_device structure. - * \return one if the signal should be delivered normally, or zero if the - * signal should be blocked. - */ -static int drm_notifier(void *priv) -{ - struct drm_device *dev = priv; - struct drm_hw_lock *lock = dev->sigdata.lock; - unsigned int old, new, prev; - - /* Allow signal delivery if lock isn't held */ - if (!lock || !_DRM_LOCK_IS_HELD(lock->lock) - || _DRM_LOCKING_CONTEXT(lock->lock) != dev->sigdata.context) - return 1; - - /* Otherwise, set flag to force call to - drmUnlock */ - do { - old = lock->lock; - new = old | _DRM_LOCK_CONT; - prev = cmpxchg(&lock->lock, old, new); - } while (prev != old); - return 0; -} - /** * This function returns immediately and takes the hw lock * with the kernel context if it is free, otherwise it gets the highest priority when and if diff --git a/include/drm/drmP.h b/include/drm/drmP.h index 8b5ce7c5d9bb..f56cdcecc1c9 100644 --- a/include/drm/drmP.h +++ b/include/drm/drmP.h @@ -822,7 +822,6 @@ struct drm_device { struct drm_sg_mem *sg; /**< Scatter gather memory */ unsigned int num_crtcs; /**< Number of CRTCs on this device */ - sigset_t sigmask; struct { int context; diff --git a/include/linux/sched.h b/include/linux/sched.h index eeb5066a44fb..923ec1a9b2b4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1570,9 +1570,7 @@ struct task_struct { unsigned long sas_ss_sp; size_t sas_ss_size; - int (*notifier)(void *priv); - void *notifier_data; - sigset_t *notifier_mask; + struct callback_head *task_works; struct audit_context *audit_context; @@ -2476,9 +2474,6 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s return ret; } -extern void block_all_signals(int (*notifier)(void *priv), void *priv, - sigset_t *mask); -extern void unblock_all_signals(void); extern void release_task(struct task_struct * p); extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index 0f6bbbe77b46..f2cbd4ed5cd4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -/* - * Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. - */ -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended. */ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; @@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = next_signal(pending, mask); - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - + if (sig) collect_signal(sig, pending, info); - } - return sig; } @@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - /* * System call entry points. -- cgit v1.2.3 From 5fa534c987784c4811757a34c425aff3ce3b5037 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Nov 2015 16:32:31 -0800 Subject: coredump: ensure all coredumping tasks have SIGNAL_GROUP_COREDUMP task_will_free_mem() is wrong in many ways, and in particular the SIGNAL_GROUP_COREDUMP check is not reliable: a task can participate in the coredumping without SIGNAL_GROUP_COREDUMP bit set. change zap_threads() paths to always set SIGNAL_GROUP_COREDUMP even if other CLONE_VM processes can't react to SIGKILL. Fortunately, at least oom-kill case if fine; it kills all tasks sharing the same mm, so it should also kill the process which actually dumps the core. The change in prepare_signal() is not strictly necessary, it just ensures that the patch does not bring another subtle behavioural change. But it reminds us that this SIGNAL_GROUP_EXIT/COREDUMP case needs more changes. Signed-off-by: Oleg Nesterov Cc: David Rientjes Cc: Kyle Walker Acked-by: Michal Hocko Cc: Stanislav Kozina Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 12 ++++++------ kernel/signal.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/coredump.c b/fs/coredump.c index a8f75640ac86..c66bb0572265 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -280,11 +280,13 @@ out: return ispipe; } -static int zap_process(struct task_struct *start, int exit_code) +static int zap_process(struct task_struct *start, int exit_code, int flags) { struct task_struct *t; int nr = 0; + /* ignore all signals except SIGKILL, see prepare_signal() */ + start->signal->flags = SIGNAL_GROUP_COREDUMP | flags; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -311,10 +313,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; - nr = zap_process(tsk, exit_code); tsk->signal->group_exit_task = tsk; - /* ignore all signals except SIGKILL, see prepare_signal() */ - tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + nr = zap_process(tsk, exit_code, 0); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); } spin_unlock_irq(&tsk->sighand->siglock); @@ -365,8 +365,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (p->mm) { if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); - nr += zap_process(p, exit_code); - p->signal->flags = SIGNAL_GROUP_EXIT; + nr += zap_process(p, exit_code, + SIGNAL_GROUP_EXIT); unlock_task_sighand(p, &flags); } break; diff --git a/kernel/signal.c b/kernel/signal.c index f2cbd4ed5cd4..c0b01fe24bbd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -788,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (signal->flags & SIGNAL_GROUP_COREDUMP) + if (!(signal->flags & SIGNAL_GROUP_EXIT)) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. -- cgit v1.2.3 From de90a6bcaede81f35e8caf4566d1006267230377 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 6 Nov 2015 16:32:45 -0800 Subject: kexec: use file name as the output message prefix kexec output message misses the prefix "kexec", when Dave Young split the kexec code. Now, we use file name as the output message prefix. Currently, the format of output message: [ 140.290795] SYSC_kexec_load: hello, world [ 140.291534] kexec: sanity_check_segment_list: hello, world Ideally, the format of output message: [ 30.791503] kexec: SYSC_kexec_load, Hello, world [ 79.182752] kexec_core: sanity_check_segment_list, Hello, world Remove the custom prefix "kexec" in output message. Signed-off-by: Minfei Huang Acked-by: Dave Young Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ kernel/kexec_core.c | 4 ++-- kernel/kexec_file.c | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 4c5edc357923..d873b64fbddc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bd9f8a03cefa..11b64a63c0f8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -6,7 +6,7 @@ * Version 2. See the file COPYING for more details. */ -#define pr_fmt(fmt) "kexec: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include @@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void) crash_notes = __alloc_percpu(size, align); if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); + pr_warn("Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6a9a3f2a0e8e..b70ada0028d2 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -9,6 +9,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -- cgit v1.2.3 From 8639b46139b0e4ea3b1ab1c274e410ee327f1d89 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Fri, 6 Nov 2015 16:32:48 -0800 Subject: pidns: fix set/getpriority and ioprio_set/get in PRIO_USER mode setpriority(PRIO_USER, 0, x) will change the priority of tasks outside of the current pid namespace. This is in contrast to both the other modes of setpriority and the example of kill(-1). Fix this. getpriority and ioprio have the same failure mode, fix them too. Eric said: : After some more thinking about it this patch sounds justifiable. : : My goal with namespaces is not to build perfect isolation mechanisms : as that can get into ill defined territory, but to build well defined : mechanisms. And to handle the corner cases so you can use only : a single namespace with well defined results. : : In this case you have found the two interfaces I am aware of that : identify processes by uid instead of by pid. Which quite frankly is : weird. Unfortunately the weird unexpected cases are hard to handle : in the usual way. : : I was hoping for a little more information. Changes like this one we : have to be careful of because someone might be depending on the current : behavior. I don't think they are and I do think this make sense as part : of the pid namespace. Signed-off-by: Ben Segall Cc: Oleg Nesterov Cc: Al Viro Cc: Ambrose Feinstein Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/ioprio.c | 6 ++++-- kernel/sys.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/block/ioprio.c b/block/ioprio.c index 31666c92b46a..cc7800e9eb44 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) + if (!uid_eq(task_uid(p), uid) || + !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) + if (!uid_eq(task_uid(p), user->uid) || + !task_pid_vnr(p)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f671a5c..6af9212ab5aa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) @@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; -- cgit v1.2.3 From 08d78658f393fefaa2e6507ea052c6f8ef4002a2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 6 Nov 2015 16:32:58 -0800 Subject: panic: release stale console lock to always get the logbuf printed out In some cases we may end up killing the CPU holding the console lock while still having valuable data in logbuf. E.g. I'm observing the following: - A crash is happening on one CPU and console_unlock() is being called on some other. - console_unlock() tries to print out the buffer before releasing the lock and on slow console it takes time. - in the meanwhile crashing CPU does lots of printk()-s with valuable data (which go to the logbuf) and sends IPIs to all other CPUs. - console_unlock() finishes printing previous chunk and enables interrupts before trying to print out the rest, the CPU catches the IPI and never releases console lock. This is not the only possible case: in VT/fb subsystems we have many other console_lock()/console_unlock() users. Non-masked interrupts (or receiving NMI in case of extreme slowness) will have the same result. Getting the whole console buffer printed out on crash should be top priority. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Vitaly Kuznetsov Cc: HATAYAMA Daisuke Cc: Masami Hiramatsu Cc: Jiri Kosina Cc: Baoquan He Cc: Prarit Bhargava Cc: Xie XiuQi Cc: Seth Jennings Cc: "K. Y. Srinivasan" Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 04e91ff7560b..4579dbb7ed87 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -147,6 +148,15 @@ void panic(const char *fmt, ...) bust_spinlocks(0); + /* + * We may have ended up stopping the CPU holding the lock (in + * smp_send_stop()) while still having some valuable data in the console + * buffer. Try to acquire the lock then release it regardless of the + * result. The release will also print the buffers out. + */ + console_trylock(); + console_unlock(); + if (!panic_blink) panic_blink = no_blink; -- cgit v1.2.3