| author | Jens Axboe <jens.axboe@oracle.com> | 2008-01-29 14:53:40 +0100 |
|---|---|---|
| committer | Jens Axboe <jens.axboe@oracle.com> | 2008-01-29 21:55:08 +0100 |
| commit | 86db1e29772372155db08ff48a9ceb76e11a2ad1 (patch) | |
| tree | 312f38eb3245873c476c50f816b85610fef9615a /block/blk-core.c | |
| parent | 8324aa91d1e11a1fc25f209687a0b2e6c2ed47d0 (diff) | |
| download | lwn-86db1e29772372155db08ff48a9ceb76e11a2ad1.tar.gz, lwn-86db1e29772372155db08ff48a9ceb76e11a2ad1.zip | |
block: continue ll_rw_blk.c splitup
Adds files for barrier handling, rq execution, io context handling,
mapping data to requests, and queue settings.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
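
The code carved out of blk-core.c below is largely the driver-facing setup API. For orientation only, here is a minimal, hedged sketch (not part of the patch) of how a bio-based "make_request" driver of this era wires up its queue with the helpers that move to the new block/blk-settings.c; every name prefixed `example_` is hypothetical.

```c
#include <linux/blkdev.h>
#include <linux/bio.h>

/*
 * Hedged sketch: a hypothetical bio-based driver configuring its queue
 * with the settings helpers relocated to block/blk-settings.c.
 */
static int example_make_request(struct request_queue *q, struct bio *bio)
{
	/* A "null" device: complete the bio immediately, no request queue. */
	bio_endio(bio, 0);
	return 0;
}

static struct request_queue *example_alloc_queue(void)
{
	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

	if (!q)
		return NULL;

	/* Bypass request queueing; bios go straight to the driver. */
	blk_queue_make_request(q, example_make_request);

	/* Per-queue limits, all moved to blk-settings.c by this series. */
	blk_queue_max_sectors(q, 256);		/* 128 KiB per request */
	blk_queue_max_phys_segments(q, 128);
	blk_queue_max_hw_segments(q, 128);
	blk_queue_hardsect_size(q, 512);
	blk_queue_dma_alignment(q, 511);
	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); /* never touches data, so no bouncing */

	return q;
}
```

Request-based drivers reach the same defaults indirectly, since blk_init_queue() calls blk_queue_make_request() with the kernel's own __make_request().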
Diffstat (limited to 'block/blk-core.c')
-rw-r--r-- | block/blk-core.c | 1255 |
1 files changed, 8 insertions, 1247 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index 937f9d0b9bd5..2c73ed1a8131 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -20,7 +20,6 @@ #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> -#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ #include <linux/completion.h> #include <linux/slab.h> #include <linux/swap.h> @@ -34,20 +33,9 @@ #include "blk.h" -/* - * for max sense size - */ -#include <scsi/scsi_cmnd.h> - -static void blk_unplug_work(struct work_struct *work); -static void blk_unplug_timeout(unsigned long data); static void drive_stat_acct(struct request *rq, int new_io); -static void init_request_from_bio(struct request *req, struct bio *bio); static int __make_request(struct request_queue *q, struct bio *bio); -static struct io_context *current_io_context(gfp_t gfp_flags, int node); static void blk_recalc_rq_segments(struct request *rq); -static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio); /* * For the allocated request tables @@ -60,28 +48,12 @@ struct kmem_cache *request_cachep; struct kmem_cache *blk_requestq_cachep = NULL; /* - * For io context allocations - */ -static struct kmem_cache *iocontext_cachep; - -/* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; -unsigned long blk_max_low_pfn, blk_max_pfn; - -EXPORT_SYMBOL(blk_max_low_pfn); -EXPORT_SYMBOL(blk_max_pfn); - static DEFINE_PER_CPU(struct list_head, blk_cpu_done); -/* Amount of time in which a process may batch requests */ -#define BLK_BATCH_TIME (HZ/50UL) - -/* Number of requests a "batching" process may submit */ -#define BLK_BATCH_REQ 32 - void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -117,113 +89,7 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) } EXPORT_SYMBOL(blk_get_backing_dev_info); -/** - * blk_queue_prep_rq - set a prepare_request function for queue - * @q: queue - * @pfn: prepare_request function - * - * It's possible for a queue to register a prepare_request callback which - * is invoked before the request is handed to the request_fn. The goal of - * the function is to prepare a request for I/O, it can be used to build a - * cdb from the request data for instance. - * - */ -void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) -{ - q->prep_rq_fn = pfn; -} - -EXPORT_SYMBOL(blk_queue_prep_rq); - -/** - * blk_queue_merge_bvec - set a merge_bvec function for queue - * @q: queue - * @mbfn: merge_bvec_fn - * - * Usually queues have static limitations on the max sectors or segments that - * we can put in a request. Stacking drivers may have some settings that - * are dynamic, and thus we have to query the queue whether it is ok to - * add a new bio_vec to a bio at a given offset or not. If the block device - * has such limitations, it needs to register a merge_bvec_fn to control - * the size of bio's sent to it. Note that a block device *must* allow a - * single page to be added to an empty bio. The block device driver may want - * to use the bio_split() function to deal with these bio's. By default - * no merge_bvec_fn is defined for a queue, and only the fixed limits are - * honored. 
- */ -void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) -{ - q->merge_bvec_fn = mbfn; -} - -EXPORT_SYMBOL(blk_queue_merge_bvec); - -void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) -{ - q->softirq_done_fn = fn; -} - -EXPORT_SYMBOL(blk_queue_softirq_done); - -/** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct bios to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling - * blk_queue_bounce() to create a buffer in normal memory. - **/ -void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) -{ - /* - * set defaults - */ - q->nr_requests = BLKDEV_MAX_RQ; - blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); - blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); - q->make_request_fn = mfn; - q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; - blk_queue_max_sectors(q, SAFE_MAX_SECTORS); - blk_queue_hardsect_size(q, 512); - blk_queue_dma_alignment(q, 511); - blk_queue_congestion_threshold(q); - q->nr_batching = BLK_BATCH_REQ; - - q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ - if (q->unplug_delay == 0) - q->unplug_delay = 1; - - INIT_WORK(&q->unplug_work, blk_unplug_work); - - q->unplug_timer.function = blk_unplug_timeout; - q->unplug_timer.data = (unsigned long)q; - - /* - * by default assume old behaviour and bounce for any highmem page - */ - blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); -} - -EXPORT_SYMBOL(blk_queue_make_request); - -static void rq_init(struct request_queue *q, struct request *rq) +void rq_init(struct request_queue *q, struct request *rq) { INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->donelist); @@ -247,255 +113,6 @@ static void rq_init(struct request_queue *q, struct request *rq) rq->next_rq = NULL; } -/** - * blk_queue_ordered - does this queue support ordered writes - * @q: the request queue - * @ordered: one of QUEUE_ORDERED_* - * @prepare_flush_fn: rq setup helper for cache flush ordered writes - * - * Description: - * For journalled file systems, doing ordered writes on a commit - * block instead of explicitly doing wait_on_buffer (which is bad - * for performance) can be a big win. Block drivers supporting this - * feature should call this function and indicate so. 
- * - **/ -int blk_queue_ordered(struct request_queue *q, unsigned ordered, - prepare_flush_fn *prepare_flush_fn) -{ - if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && - prepare_flush_fn == NULL) { - printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); - return -EINVAL; - } - - if (ordered != QUEUE_ORDERED_NONE && - ordered != QUEUE_ORDERED_DRAIN && - ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA && - ordered != QUEUE_ORDERED_TAG && - ordered != QUEUE_ORDERED_TAG_FLUSH && - ordered != QUEUE_ORDERED_TAG_FUA) { - printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); - return -EINVAL; - } - - q->ordered = ordered; - q->next_ordered = ordered; - q->prepare_flush_fn = prepare_flush_fn; - - return 0; -} - -EXPORT_SYMBOL(blk_queue_ordered); - -/* - * Cache flushing for ordered writes handling - */ -inline unsigned blk_ordered_cur_seq(struct request_queue *q) -{ - if (!q->ordseq) - return 0; - return 1 << ffz(q->ordseq); -} - -unsigned blk_ordered_req_seq(struct request *rq) -{ - struct request_queue *q = rq->q; - - BUG_ON(q->ordseq == 0); - - if (rq == &q->pre_flush_rq) - return QUEUE_ORDSEQ_PREFLUSH; - if (rq == &q->bar_rq) - return QUEUE_ORDSEQ_BAR; - if (rq == &q->post_flush_rq) - return QUEUE_ORDSEQ_POSTFLUSH; - - /* - * !fs requests don't need to follow barrier ordering. Always - * put them at the front. This fixes the following deadlock. - * - * http://thread.gmane.org/gmane.linux.kernel/537473 - */ - if (!blk_fs_request(rq)) - return QUEUE_ORDSEQ_DRAIN; - - if ((rq->cmd_flags & REQ_ORDERED_COLOR) == - (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) - return QUEUE_ORDSEQ_DRAIN; - else - return QUEUE_ORDSEQ_DONE; -} - -void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) -{ - struct request *rq; - - if (error && !q->orderr) - q->orderr = error; - - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; - - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) - return; - - /* - * Okay, sequence complete. - */ - q->ordseq = 0; - rq = q->orig_bar_rq; - - if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq))) - BUG(); -} - -static void pre_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); -} - -static void bar_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); -} - -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); -} - -static void queue_flush(struct request_queue *q, unsigned which) -{ - struct request *rq; - rq_end_io_fn *end_io; - - if (which == QUEUE_ORDERED_PREFLUSH) { - rq = &q->pre_flush_rq; - end_io = pre_flush_end_io; - } else { - rq = &q->post_flush_rq; - end_io = post_flush_end_io; - } - - rq->cmd_flags = REQ_HARDBARRIER; - rq_init(q, rq); - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; - rq->rq_disk = q->bar_rq.rq_disk; - rq->end_io = end_io; - q->prepare_flush_fn(q, rq); - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); -} - -static inline struct request *start_ordered(struct request_queue *q, - struct request *rq) -{ - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * Prep proxy barrier request. 
- */ - blkdev_dequeue_request(rq); - q->orig_bar_rq = rq; - rq = &q->bar_rq; - rq->cmd_flags = 0; - rq_init(q, rq); - if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_RW; - if (q->ordered & QUEUE_ORDERED_FUA) - rq->cmd_flags |= REQ_FUA; - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; - init_request_from_bio(rq, q->orig_bar_rq->bio); - rq->end_io = bar_end_io; - - /* - * Queue ordered sequence. As we stack them at the head, we - * need to queue in reverse order. Note that we rely on that - * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. If this request is - * an empty barrier, we don't need to do a postflush ever since - * there will be no data written between the pre and post flush. - * Hence a single flush will suffice. - */ - if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq)) - queue_flush(q, QUEUE_ORDERED_POSTFLUSH); - else - q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - - if (q->ordered & QUEUE_ORDERED_PREFLUSH) { - queue_flush(q, QUEUE_ORDERED_PREFLUSH); - rq = &q->pre_flush_rq; - } else - q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; - - if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) - q->ordseq |= QUEUE_ORDSEQ_DRAIN; - else - rq = NULL; - - return rq; -} - -int blk_do_ordered(struct request_queue *q, struct request **rqp) -{ - struct request *rq = *rqp; - const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); - - if (!q->ordseq) { - if (!is_barrier) - return 1; - - if (q->next_ordered != QUEUE_ORDERED_NONE) { - *rqp = start_ordered(q, rq); - return 1; - } else { - /* - * This can happen when the queue switches to - * ORDERED_NONE while this request is on it. - */ - blkdev_dequeue_request(rq); - if (__blk_end_request(rq, -EOPNOTSUPP, - blk_rq_bytes(rq))) - BUG(); - *rqp = NULL; - return 0; - } - } - - /* - * Ordered sequence in progress - */ - - /* Special requests are not subject to ordering rules. */ - if (!blk_fs_request(rq) && - rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return 1; - - if (q->ordered & QUEUE_ORDERED_TAG) { - /* Ordered by tag. Blocking the next barrier is enough. */ - if (is_barrier && rq != &q->bar_rq) - *rqp = NULL; - } else { - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; - } - - return 1; -} - static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { @@ -528,279 +145,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio, } } -/** - * blk_queue_bounce_limit - set bounce buffer limit for queue - * @q: the request queue for the device - * @dma_addr: bus address limit - * - * Description: - * Different hardware can have different requirements as to what pages - * it can do I/O directly to. A low level driver can call - * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @page. - **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) -{ - unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; - int dma = 0; - - q->bounce_gfp = GFP_NOIO; -#if BITS_PER_LONG == 64 - /* Assume anything <= 4GB can be handled by IOMMU. - Actually some IOMMUs can handle everything, but I don't - know of a way to test this here. 
*/ - if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) - dma = 1; - q->bounce_pfn = max_low_pfn; -#else - if (bounce_pfn < blk_max_low_pfn) - dma = 1; - q->bounce_pfn = bounce_pfn; -#endif - if (dma) { - init_emergency_isa_pool(); - q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->bounce_pfn = bounce_pfn; - } -} - -EXPORT_SYMBOL(blk_queue_bounce_limit); - -/** - * blk_queue_max_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_sectors: max sectors in the usual 512b unit - * - * Description: - * Enables a low level driver to set an upper limit on the size of - * received requests. - **/ -void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) -{ - if ((max_sectors << 9) < PAGE_CACHE_SIZE) { - max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); - printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); - } - - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->max_hw_sectors = q->max_sectors = max_sectors; - else { - q->max_sectors = BLK_DEF_MAX_SECTORS; - q->max_hw_sectors = max_sectors; - } -} - -EXPORT_SYMBOL(blk_queue_max_sectors); - -/** - * blk_queue_max_phys_segments - set max phys segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * physical data segments in a request. This would be the largest sized - * scatter list the driver could handle. - **/ -void blk_queue_max_phys_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); - } - - q->max_phys_segments = max_segments; -} - -EXPORT_SYMBOL(blk_queue_max_phys_segments); - -/** - * blk_queue_max_hw_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give as once - * to the device. - **/ -void blk_queue_max_hw_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); - } - - q->max_hw_segments = max_segments; -} - -EXPORT_SYMBOL(blk_queue_max_hw_segments); - -/** - * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg - * @q: the request queue for the device - * @max_size: max size of segment in bytes - * - * Description: - * Enables a low level driver to set an upper limit on the size of a - * coalesced segment - **/ -void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) -{ - if (max_size < PAGE_CACHE_SIZE) { - max_size = PAGE_CACHE_SIZE; - printk("%s: set to minimum %d\n", __FUNCTION__, max_size); - } - - q->max_segment_size = max_size; -} - -EXPORT_SYMBOL(blk_queue_max_segment_size); - -/** - * blk_queue_hardsect_size - set hardware sector size for the queue - * @q: the request queue for the device - * @size: the hardware sector size, in bytes - * - * Description: - * This should typically be set to the lowest possible sector size - * that the hardware can operate on (possible without reverting to - * even internal read-modify-write operations). Usually the default - * of 512 covers most hardware. 
- **/ -void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) -{ - q->hardsect_size = size; -} - -EXPORT_SYMBOL(blk_queue_hardsect_size); - -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - -/** - * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers - * @t: the stacking driver (top) - * @b: the underlying device (bottom) - **/ -void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) -{ - /* zero is "infinity" */ - t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); - t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); - - t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); - t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); - t->max_segment_size = min(t->max_segment_size,b->max_segment_size); - t->hardsect_size = max(t->hardsect_size,b->hardsect_size); - if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) - clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); -} - -EXPORT_SYMBOL(blk_queue_stack_limits); - -/** - * blk_queue_dma_drain - Set up a drain buffer for excess dma. - * - * @q: the request queue for the device - * @buf: physically contiguous buffer - * @size: size of the buffer in bytes - * - * Some devices have excess DMA problems and can't simply discard (or - * zero fill) the unwanted piece of the transfer. They have to have a - * real area of memory to transfer it into. The use case for this is - * ATAPI devices in DMA mode. If the packet command causes a transfer - * bigger than the transfer size some HBAs will lock up if there - * aren't DMA elements to contain the excess transfer. What this API - * does is adjust the queue so that the buf is always appended - * silently to the scatterlist. - * - * Note: This routine adjusts max_hw_segments to make room for - * appending the drain buffer. If you call - * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after - * calling this routine, you must set the limit to one fewer than your - * device can support otherwise there won't be room for the drain - * buffer. - */ -int blk_queue_dma_drain(struct request_queue *q, void *buf, - unsigned int size) -{ - if (q->max_hw_segments < 2 || q->max_phys_segments < 2) - return -EINVAL; - /* make room for appending the drain */ - --q->max_hw_segments; - --q->max_phys_segments; - q->dma_drain_buffer = buf; - q->dma_drain_size = size; - - return 0; -} - -EXPORT_SYMBOL_GPL(blk_queue_dma_drain); - -/** - * blk_queue_segment_boundary - set boundary rules for segment merging - * @q: the request queue for the device - * @mask: the memory boundary mask - **/ -void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) -{ - if (mask < PAGE_CACHE_SIZE - 1) { - mask = PAGE_CACHE_SIZE - 1; - printk("%s: set to minimum %lx\n", __FUNCTION__, mask); - } - - q->seg_boundary_mask = mask; -} - -EXPORT_SYMBOL(blk_queue_segment_boundary); - -/** - * blk_queue_dma_alignment - set dma length and memory alignment - * @q: the request queue for the device - * @mask: alignment mask - * - * description: - * set required memory and length aligment for direct dma transactions. - * this is used when buiding direct io requests for the queue. 
- * - **/ -void blk_queue_dma_alignment(struct request_queue *q, int mask) -{ - q->dma_alignment = mask; -} - -EXPORT_SYMBOL(blk_queue_dma_alignment); - -/** - * blk_queue_update_dma_alignment - update dma length and memory alignment - * @q: the request queue for the device - * @mask: alignment mask - * - * description: - * update required memory and length aligment for direct dma transactions. - * If the requested alignment is larger than the current alignment, then - * the current queue alignment is updated to the new value, otherwise it - * is left alone. The design of this is to allow multiple objects - * (driver, device, transport etc) to set their respective - * alignments without having them interfere. - * - **/ -void blk_queue_update_dma_alignment(struct request_queue *q, int mask) -{ - BUG_ON(mask > PAGE_SIZE); - - if (mask > q->dma_alignment) - q->dma_alignment = mask; -} - -EXPORT_SYMBOL(blk_queue_update_dma_alignment); - void blk_dump_rq_flags(struct request *rq, char *msg) { int bit; @@ -1074,8 +418,8 @@ static inline int ll_new_hw_segment(struct request_queue *q, return 1; } -static int ll_back_merge_fn(struct request_queue *q, struct request *req, - struct bio *bio) +int ll_back_merge_fn(struct request_queue *q, struct request *req, + struct bio *bio) { unsigned short max_sectors; int len; @@ -1285,7 +629,7 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi, blk_unplug(q); } -static void blk_unplug_work(struct work_struct *work) +void blk_unplug_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, unplug_work); @@ -1296,7 +640,7 @@ static void blk_unplug_work(struct work_struct *work) q->unplug_fn(q); } -static void blk_unplug_timeout(unsigned long data) +void blk_unplug_timeout(unsigned long data) { struct request_queue *q = (struct request_queue *)data; @@ -1961,393 +1305,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq, EXPORT_SYMBOL(blk_insert_request); -static int __blk_rq_unmap_user(struct bio *bio) -{ - int ret = 0; - - if (bio) { - if (bio_flagged(bio, BIO_USER_MAPPED)) - bio_unmap_user(bio); - else - ret = bio_uncopy_user(bio); - } - - return ret; -} - -int blk_rq_append_bio(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - if (!rq->bio) - blk_rq_bio_prep(q, rq, bio); - else if (!ll_back_merge_fn(q, rq, bio)) - return -EINVAL; - else { - rq->biotail->bi_next = bio; - rq->biotail = bio; - - rq->data_len += bio->bi_size; - } - return 0; -} -EXPORT_SYMBOL(blk_rq_append_bio); - -static int __blk_rq_map_user(struct request_queue *q, struct request *rq, - void __user *ubuf, unsigned int len) -{ - unsigned long uaddr; - struct bio *bio, *orig_bio; - int reading, ret; - - reading = rq_data_dir(rq) == READ; - - /* - * if alignment requirement is satisfied, map in user pages for - * direct dma. 
else, set up kernel bounce buffers - */ - uaddr = (unsigned long) ubuf; - if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) - bio = bio_map_user(q, NULL, uaddr, len, reading); - else - bio = bio_copy_user(q, uaddr, len, reading); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - orig_bio = bio; - blk_queue_bounce(q, &bio); - - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - bio_get(bio); - - ret = blk_rq_append_bio(q, rq, bio); - if (!ret) - return bio->bi_size; - - /* if it was boucned we must call the end io function */ - bio_endio(bio, 0); - __blk_rq_unmap_user(orig_bio); - bio_put(bio); - return ret; -} - -/** - * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request structure to fill - * @ubuf: the user buffer - * @len: length of user data - * - * Description: - * Data will be mapped directly for zero copy io, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of io, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. - */ -int blk_rq_map_user(struct request_queue *q, struct request *rq, - void __user *ubuf, unsigned long len) -{ - unsigned long bytes_read = 0; - struct bio *bio = NULL; - int ret; - - if (len > (q->max_hw_sectors << 9)) - return -EINVAL; - if (!len || !ubuf) - return -EINVAL; - - while (bytes_read != len) { - unsigned long map_len, end, start; - - map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); - end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) - >> PAGE_SHIFT; - start = (unsigned long)ubuf >> PAGE_SHIFT; - - /* - * A bad offset could cause us to require BIO_MAX_PAGES + 1 - * pages. If this happens we just lower the requested - * mapping len by a page so that we can fit - */ - if (end - start > BIO_MAX_PAGES) - map_len -= PAGE_SIZE; - - ret = __blk_rq_map_user(q, rq, ubuf, map_len); - if (ret < 0) - goto unmap_rq; - if (!bio) - bio = rq->bio; - bytes_read += ret; - ubuf += ret; - } - - rq->buffer = rq->data = NULL; - return 0; -unmap_rq: - blk_rq_unmap_user(bio); - return ret; -} - -EXPORT_SYMBOL(blk_rq_map_user); - -/** - * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request to map data to - * @iov: pointer to the iovec - * @iov_count: number of elements in the iovec - * @len: I/O byte count - * - * Description: - * Data will be mapped directly for zero copy io, if possible. Otherwise - * a kernel bounce buffer is used. - * - * A matching blk_rq_unmap_user() must be issued at the end of io, while - * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. 
- */ -int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct sg_iovec *iov, int iov_count, unsigned int len) -{ - struct bio *bio; - - if (!iov || iov_count <= 0) - return -EINVAL; - - /* we don't allow misaligned data like bio_map_user() does. If the - * user is using sg, they're expected to know the alignment constraints - * and respect them accordingly */ - bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (bio->bi_size != len) { - bio_endio(bio, 0); - bio_unmap_user(bio); - return -EINVAL; - } - - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); - rq->buffer = rq->data = NULL; - return 0; -} - -EXPORT_SYMBOL(blk_rq_map_user_iov); - -/** - * blk_rq_unmap_user - unmap a request with user data - * @bio: start of bio list - * - * Description: - * Unmap a rq previously mapped by blk_rq_map_user(). The caller must - * supply the original rq->bio from the blk_rq_map_user() return, since - * the io completion may have changed rq->bio. - */ -int blk_rq_unmap_user(struct bio *bio) -{ - struct bio *mapped_bio; - int ret = 0, ret2; - - while (bio) { - mapped_bio = bio; - if (unlikely(bio_flagged(bio, BIO_BOUNCED))) - mapped_bio = bio->bi_private; - - ret2 = __blk_rq_unmap_user(mapped_bio); - if (ret2 && !ret) - ret = ret2; - - mapped_bio = bio; - bio = bio->bi_next; - bio_put(mapped_bio); - } - - return ret; -} - -EXPORT_SYMBOL(blk_rq_unmap_user); - -/** - * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage - * @q: request queue where request should be inserted - * @rq: request to fill - * @kbuf: the kernel buffer - * @len: length of user data - * @gfp_mask: memory allocation flags - */ -int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, - unsigned int len, gfp_t gfp_mask) -{ - struct bio *bio; - - if (len > (q->max_hw_sectors << 9)) - return -EINVAL; - if (!len || !kbuf) - return -EINVAL; - - bio = bio_map_kern(q, kbuf, len, gfp_mask); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (rq_data_dir(rq) == WRITE) - bio->bi_rw |= (1 << BIO_RW); - - blk_rq_bio_prep(q, rq, bio); - blk_queue_bounce(q, &rq->bio); - rq->buffer = rq->data = NULL; - return 0; -} - -EXPORT_SYMBOL(blk_rq_map_kern); - -/** - * blk_execute_rq_nowait - insert a request into queue for execution - * @q: queue to insert the request in - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * @done: I/O completion handler - * - * Description: - * Insert a fully prepared request at the back of the io scheduler queue - * for execution. Don't wait for completion. - */ -void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head, - rq_end_io_fn *done) -{ - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - - rq->rq_disk = bd_disk; - rq->cmd_flags |= REQ_NOMERGE; - rq->end_io = done; - WARN_ON(irqs_disabled()); - spin_lock_irq(q->queue_lock); - __elv_add_request(q, rq, where, 1); - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); - -/** - * blk_execute_rq - insert a request into queue for execution - * @q: queue to insert the request in - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * - * Description: - * Insert a fully prepared request at the back of the io scheduler queue - * for execution and wait for completion. 
- */ -int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head) -{ - DECLARE_COMPLETION_ONSTACK(wait); - char sense[SCSI_SENSE_BUFFERSIZE]; - int err = 0; - - /* - * we need an extra reference to the request, so we can look at - * it after io completion - */ - rq->ref_count++; - - if (!rq->sense) { - memset(sense, 0, sizeof(sense)); - rq->sense = sense; - rq->sense_len = 0; - } - - rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); - - if (rq->errors) - err = -EIO; - - return err; -} - -EXPORT_SYMBOL(blk_execute_rq); - -static void bio_end_empty_barrier(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - - complete(bio->bi_private); -} - -/** - * blkdev_issue_flush - queue a flush - * @bdev: blockdev to issue flush for - * @error_sector: error sector - * - * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. Caller must run wait_for_completion() on its own. - */ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q; - struct bio *bio; - int ret; - - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - - bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; - bio->bi_bdev = bdev; - submit_bio(1 << BIO_RW_BARRIER, bio); - - wait_for_completion(&wait); - - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from rq->sector. 
- */ - if (error_sector) - *error_sector = bio->bi_sector; - - ret = 0; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - - bio_put(bio); - return ret; -} - -EXPORT_SYMBOL(blkdev_issue_flush); - static void drive_stat_acct(struct request *rq, int new_io) { int rw = rq_data_dir(rq); @@ -2459,26 +1416,6 @@ void blk_put_request(struct request *req) EXPORT_SYMBOL(blk_put_request); -/** - * blk_end_sync_rq - executes a completion event on a request - * @rq: request to complete - * @error: end io status of the request - */ -void blk_end_sync_rq(struct request *rq, int error) -{ - struct completion *waiting = rq->end_io_data; - - rq->end_io_data = NULL; - __blk_put_request(rq->q, rq); - - /* - * complete last, if this is a stack request the process (and thus - * the rq pointer) could be invalid right after this complete() - */ - complete(waiting); -} -EXPORT_SYMBOL(blk_end_sync_rq); - /* * Has to be called with the request spinlock acquired */ @@ -2557,7 +1494,7 @@ static inline int attempt_front_merge(struct request_queue *q, return 0; } -static void init_request_from_bio(struct request *req, struct bio *bio) +void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -3524,8 +2461,8 @@ int blk_end_request_callback(struct request *rq, int error, int nr_bytes, } EXPORT_SYMBOL_GPL(blk_end_request_callback); -static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, - struct bio *bio) +void blk_rq_bio_prep(struct request_queue *q, struct request *rq, + struct bio *bio) { /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= (bio->bi_rw & 3); @@ -3571,188 +2508,12 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("blkdev_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); - iocontext_cachep = kmem_cache_create("blkdev_ioc", - sizeof(struct io_context), 0, SLAB_PANIC, NULL); - for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); register_hotcpu_notifier(&blk_cpu_notifier); - blk_max_low_pfn = max_low_pfn - 1; - blk_max_pfn = max_pfn - 1; - - return 0; -} - -static void cfq_dtor(struct io_context *ioc) -{ - struct cfq_io_context *cic[1]; - int r; - - /* - * We don't have a specific key to lookup with, so use the gang - * lookup to just retrieve the first item stored. The cfq exit - * function will iterate the full tree, so any member will do. - */ - r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); - if (r > 0) - cic[0]->dtor(ioc); -} - -/* - * IO Context helper functions. put_io_context() returns 1 if there are no - * more users of this io context, 0 otherwise. 
- */ -int put_io_context(struct io_context *ioc) -{ - if (ioc == NULL) - return 1; - - BUG_ON(atomic_read(&ioc->refcount) == 0); - - if (atomic_dec_and_test(&ioc->refcount)) { - rcu_read_lock(); - if (ioc->aic && ioc->aic->dtor) - ioc->aic->dtor(ioc->aic); - rcu_read_unlock(); - cfq_dtor(ioc); - - kmem_cache_free(iocontext_cachep, ioc); - return 1; - } return 0; } -EXPORT_SYMBOL(put_io_context); - -static void cfq_exit(struct io_context *ioc) -{ - struct cfq_io_context *cic[1]; - int r; - - rcu_read_lock(); - /* - * See comment for cfq_dtor() - */ - r = radix_tree_gang_lookup(&ioc->radix_root, (void **) cic, 0, 1); - rcu_read_unlock(); - - if (r > 0) - cic[0]->exit(ioc); -} - -/* Called by the exitting task */ -void exit_io_context(void) -{ - struct io_context *ioc; - - task_lock(current); - ioc = current->io_context; - current->io_context = NULL; - task_unlock(current); - - if (atomic_dec_and_test(&ioc->nr_tasks)) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); - cfq_exit(ioc); - - put_io_context(ioc); - } -} - -struct io_context *alloc_io_context(gfp_t gfp_flags, int node) -{ - struct io_context *ret; - - ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - ret->ioc_data = NULL; - } - - return ret; -} - -/* - * If the current task has no IO context then create one and initialise it. - * Otherwise, return its existing IO context. - * - * This returned IO context doesn't have a specifically elevated refcount, - * but since the current task itself holds a reference, the context can be - * used in general code, so long as it stays within `current` context. - */ -static struct io_context *current_io_context(gfp_t gfp_flags, int node) -{ - struct task_struct *tsk = current; - struct io_context *ret; - - ret = tsk->io_context; - if (likely(ret)) - return ret; - - ret = alloc_io_context(gfp_flags, node); - if (ret) { - /* make sure set_task_ioprio() sees the settings above */ - smp_wmb(); - tsk->io_context = ret; - } - - return ret; -} - -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. - * - * This is always called in the context of the task which submitted the I/O. - */ -struct io_context *get_io_context(gfp_t gfp_flags, int node) -{ - struct io_context *ret = NULL; - - /* - * Check for unlikely race with exiting task. ioc ref count is - * zero when ioc is being detached. - */ - do { - ret = current_io_context(gfp_flags, node); - if (unlikely(!ret)) - break; - } while (!atomic_inc_not_zero(&ret->refcount)); - - return ret; -} -EXPORT_SYMBOL(get_io_context); - -void copy_io_context(struct io_context **pdst, struct io_context **psrc) -{ - struct io_context *src = *psrc; - struct io_context *dst = *pdst; - - if (src) { - BUG_ON(atomic_read(&src->refcount) == 0); - atomic_inc(&src->refcount); - put_io_context(dst); - *pdst = src; - } -} -EXPORT_SYMBOL(copy_io_context); - -void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) -{ - struct io_context *temp; - temp = *ioc1; - *ioc1 = *ioc2; - *ioc2 = temp; -} -EXPORT_SYMBOL(swap_io_context); |
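
The ordered-write machinery removed above heads for block/blk-barrier.c. As a hedged sketch of the registration side, a request-based driver with a volatile write cache might opt into drain-plus-flush ordering roughly like this; the `example_` names and the raw SCSI SYNCHRONIZE CACHE(10) opcode are illustrative assumptions, not taken from the patch.

```c
#include <linux/blkdev.h>

/*
 * Hedged sketch: hooking a write-back-cache device into the ordered-write
 * code that moves to blk-barrier.c.  "example_" names are hypothetical.
 */
static void example_prepare_flush(struct request_queue *q, struct request *rq)
{
	/*
	 * Called for the proxy pre/post-flush requests built by queue_flush();
	 * turn the blank request into a cache-flush command.
	 */
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->timeout = 60 * HZ;
	rq->cmd_len = 10;
	rq->cmd[0] = 0x35;	/* SCSI SYNCHRONIZE CACHE(10), for illustration */
}

static void example_register_ordered(struct request_queue *q)
{
	/*
	 * Drain the queue around each barrier and flush the cache before and
	 * after it; blk_do_ordered() then sequences the proxy requests.
	 */
	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, example_prepare_flush);
}
```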
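
On the consumer side, blkdev_issue_flush() is the entry point that builds the empty-barrier bio shown in the removed hunk. A minimal usage sketch, with a hypothetical `example_` wrapper:

```c
#include <linux/blkdev.h>

/* Hedged sketch: flush a device's volatile write cache and wait for it. */
static int example_flush_device(struct block_device *bdev)
{
	/* NULL: we don't care about the error sector, only success/failure. */
	return blkdev_issue_flush(bdev, NULL);
}
```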
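
Finally, the helpers bound for block/blk-map.c and blk-exec.c are normally used as a pair for SG_IO-style passthrough: map a user buffer into a request, execute it synchronously, then unmap. A hedged sketch along the lines of the scsi_ioctl path, with abbreviated error handling and hypothetical `example_` names:

```c
#include <linux/blkdev.h>
#include <linux/bio.h>

/*
 * Hedged sketch: pairing blk_rq_map_user()/blk_rq_unmap_user() (blk-map.c)
 * with blk_execute_rq() (blk-exec.c) for a BLOCK_PC passthrough command.
 */
static int example_passthrough(struct request_queue *q, struct gendisk *disk,
			       void __user *ubuf, unsigned long len, int write)
{
	struct request *rq;
	struct bio *bio;
	int err;

	rq = blk_get_request(q, write ? WRITE : READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	/* the caller would fill in rq->cmd[], rq->cmd_len and rq->timeout here */

	err = blk_rq_map_user(q, rq, ubuf, len);	/* zero-copy if aligned */
	if (err)
		goto out;
	bio = rq->bio;		/* save it: completion may advance rq->bio */

	err = blk_execute_rq(q, disk, rq, 0);		/* insert at tail and wait */

	if (blk_rq_unmap_user(bio) && !err)
		err = -EFAULT;
out:
	blk_put_request(rq);
	return err;
}
```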