From 8757ad65d30f009fe0beeb2d70d3cd834cb998f2 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 11 Apr 2014 10:37:39 -0400 Subject: NVMe: Update copyright headers Make the copyright dates accurate and remove the final paragraph that includes the address of the FSF. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 4 ---- drivers/block/nvme-scsi.c | 6 +----- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 7c64fa756cce..eacd64e36be3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #include diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 2c3f5be06da1..da3b252dea6f 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1,6 +1,6 @@ /* * NVM Express device driver - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011-2014, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ /* -- cgit v1.2.3 From 27e8166c31656f7780e682eaf6bc9c3b8dd03253 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 11 Apr 2014 11:58:45 -0400 Subject: NVMe: Improve error messages Help people diagnose what is going wrong at initialisation time by printing out which command has gone wrong and what the device returned. Also fix the error message printed while waiting for reset. Signed-off-by: Matthew Wilcox Reviewed-by: Keith Busch --- drivers/block/nvme-core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index eacd64e36be3..074e9829bb08 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1354,7 +1354,8 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) return -EINTR; if (time_after(jiffies, timeout)) { dev_err(&dev->pci_dev->dev, - "Device not ready; aborting initialisation\n"); + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); return -ENODEV; } } @@ -2058,8 +2059,13 @@ static int set_queue_count(struct nvme_dev *dev, int count) status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, &result); - if (status) - return status < 0 ? -EIO : -EBUSY; + if (status < 0) + return status; + if (status > 0) { + dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", + status); + return -EBUSY; + } return min(result & 0xffff, result >> 16) + 1; } @@ -2180,6 +2186,7 @@ static int nvme_dev_add(struct nvme_dev *dev) res = nvme_identify(dev, 0, 1, dma_addr); if (res) { + dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); res = -EIO; goto out; } -- cgit v1.2.3 From 94bbac4052eb93219ca0aa370ca741486b25fb98 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 24 Apr 2014 18:53:50 -0600 Subject: NVMe: Protect against badly formatted CQEs If a misbehaving device posts a CQE with a command id < depth but for one that was never allocated, the command info will have a callback function set to NULL and we don't want to try invoking that. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 074e9829bb08..b9f07f81ea5d 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -243,8 +243,9 @@ static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, void *ctx; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth) { - *fn = special_completion; + if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) { + if (fn) + *fn = special_completion; return CMD_CTX_INVALID; } if (fn) -- cgit v1.2.3 From 3291fa57cb1b004c1a4823beb28b5cc72555f1a5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 28 Apr 2014 12:30:52 -0600 Subject: NVMe: Add tracepoints Adding tracepoints for bio_complete and block_split into nvme to help with gathering IO info using blktrace and blkparse. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- block/blk-core.c | 1 + drivers/block/nvme-core.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/block/blk-core.c b/block/blk-core.c index a0e3096c4bb5..c488b55dddd7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -43,6 +43,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index b9f07f81ea5d..025dd4cad4a6 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -42,6 +42,8 @@ #include #include +#include + #define NVME_Q_DEPTH 1024 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) @@ -411,6 +413,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_iod *iod = ctx; struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; + int error = 0; if (unlikely(status)) { if (!(status & NVME_SC_DNR || @@ -423,6 +426,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, wake_up(&nvmeq->sq_full); return; } + error = -EIO; } if (iod->nents) { dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, @@ -430,10 +434,9 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, nvme_end_io_acct(bio, iod->start_time); } nvme_free_iod(nvmeq->dev, iod); - if (status) - bio_endio(bio, -EIO); - else - bio_endio(bio, 0); + + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); + bio_endio(bio, error); } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -522,6 +525,8 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, if (!split) return -ENOMEM; + trace_block_split(bdev_get_queue(bio->bi_bdev), bio, + split->bi_iter.bi_sector); bio_chain(split, bio); if (!waitqueue_active(&nvmeq->sq_full)) -- cgit v1.2.3 From a7d2ce2832d84e0182585f63bf96ca7323b3aee7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Apr 2014 11:41:28 -0600 Subject: NVMe: Configure support for block flush This configures an nvme request_queue as flush capable if the device has a volatile write cache present. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 3 +++ include/linux/nvme.h | 1 + include/uapi/linux/nvme.h | 1 + 3 files changed, 5 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 025dd4cad4a6..e7c4fdb6a651 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1897,6 +1897,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); disk->major = nvme_major; disk->first_minor = 0; @@ -2201,6 +2203,7 @@ static int nvme_dev_add(struct nvme_dev *dev) nn = le32_to_cpup(&ctrl->nn); dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; + dev->vwc = ctrl->vwc; memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index cfd084cab22b..6266373d3147 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -99,6 +99,7 @@ struct nvme_dev { u32 stripe_size; u16 oncs; u16 abort_limit; + u8 vwc; u8 initialized; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index ad9014e49693..f090336d5bad 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -73,6 +73,7 @@ enum { NVME_CTRL_ONCS_COMPARE = 1 << 0, NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_VWC_PRESENT = 1 << 0, }; struct nvme_lbaf { -- cgit v1.2.3 From 53562be74bd06bbe74d2acf3caca5398f8eeb160 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Apr 2014 11:41:29 -0600 Subject: NVMe: Flush with data support It is possible a filesystem may send a flush flagged bio with write data. There is no such composite NVMe command, so the driver sends flush and write separately. The device is allowed to execute these commands in any order, so it was possible the driver ends the bio after the write completes, but while the flush is still active. We don't want to let a filesystem believe flush succeeded before it really has; this could cause data corruption on a power loss between these events. To fix, this patch splits the flush and write into chained bios. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 44 ++++++++++++++++++++++++-------------------- include/linux/nvme.h | 1 - 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e7c4fdb6a651..cd8a8bc711cc 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -197,16 +197,13 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) -#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE) +#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) return; - if (ctx == CMD_CTX_FLUSH) - return; if (ctx == CMD_CTX_ABORT) { ++nvmeq->dev->abort_limit; return; @@ -629,16 +626,6 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; } -int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) -{ - int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - special_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; - - return nvme_submit_flush(nvmeq, ns, cmdid); -} - static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) { struct bio *bio = iod->private; @@ -654,7 +641,7 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) if (bio->bi_rw & REQ_DISCARD) return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if ((bio->bi_rw & REQ_FLUSH) && !iod->nents) + if (bio->bi_rw & REQ_FLUSH) return nvme_submit_flush(nvmeq, ns, cmdid); control = 0; @@ -688,6 +675,26 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) return 0; } +static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) +{ + struct bio *split = bio_clone(bio, GFP_ATOMIC); + if (!split) + return -ENOMEM; + + split->bi_iter.bi_size = 0; + split->bi_phys_segments = 0; + bio->bi_rw &= ~REQ_FLUSH; + bio_chain(split, bio); + + if (!waitqueue_active(&nvmeq->sq_full)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, split); + bio_list_add(&nvmeq->sq_cong, bio); + wake_up_process(nvme_thread); + + return 0; +} + /* * Called with local interrupts disabled and the q_lock held. May not sleep. */ @@ -698,11 +705,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, int psegs = bio_phys_segments(ns->queue, bio); int result; - if ((bio->bi_rw & REQ_FLUSH) && psegs) { - result = nvme_submit_flush_data(nvmeq, ns); - if (result) - return result; - } + if ((bio->bi_rw & REQ_FLUSH) && psegs) + return nvme_split_flush_data(nvmeq, bio); iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); if (!iod) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6266373d3147..1813cfdb7e80 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -156,7 +156,6 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod); int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *); -int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, u32 *result); int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, -- cgit v1.2.3 From 21bd78bcf4208e84deab0d34f9d4e034d0580d0c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 9 May 2014 22:42:26 -0400 Subject: NVMe: Enable BUILD_BUG_ON checks Since _nvme_check_size() wasn't being called from anywhere, the compiler was optimising it away ... along with all the link-time build failures that would result if any of the structures were the wrong size. Call it from nvme_exit() for no particular reason. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cd8a8bc711cc..b82155888845 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2912,6 +2912,7 @@ static void __exit nvme_exit(void) unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); + _nvme_check_size(); } MODULE_AUTHOR("Matthew Wilcox "); -- cgit v1.2.3 From 4131f2fcdc5853d995044d4ed995a25ee4eb5ab2 Mon Sep 17 00:00:00 2001 From: Indraneel Mukherjee Date: Thu, 29 May 2014 12:02:03 +0530 Subject: NVMe: Fix the buffer size passed in GetLogPage(CDW10.NUMD) In GetLogPage the buffer size passed to device is a 0's based value. Signed-off-by: Indraneel M Reported-by: Shiro Itou Reviewed-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index da3b252dea6f..7ec61c4a817c 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1014,8 +1014,8 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, c.common.opcode = nvme_admin_get_log_page; c.common.nsid = cpu_to_le32(0xFFFFFFFF); c.common.prp1 = cpu_to_le64(dma_addr); - c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) - 1) << 16) | NVME_GET_SMART_LOG_PAGE); res = nvme_submit_admin_cmd(dev, &c, NULL); if (res != NVME_SC_SUCCESS) { temp_c = LOG_TEMP_UNKNOWN; @@ -1082,8 +1082,8 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.opcode = nvme_admin_get_log_page; c.common.nsid = cpu_to_le32(0xFFFFFFFF); c.common.prp1 = cpu_to_le64(dma_addr); - c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) - 1) << 16) | NVME_GET_SMART_LOG_PAGE); res = nvme_submit_admin_cmd(dev, &c, NULL); if (res != NVME_SC_SUCCESS) { temp_c_cur = LOG_TEMP_UNKNOWN; -- cgit v1.2.3 From 6808c5fb7fc1574c7608a38c9819f1639d89c3d0 Mon Sep 17 00:00:00 2001 From: Santosh Y Date: Thu, 29 May 2014 10:01:52 +0530 Subject: NVMe: Prevent possible NULL pointer dereference kmalloc() used by the nvme_alloc_iod() to allocate memory for 'iod' can fail. So check the return value. Signed-off-by: Santosh Y Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index b82155888845..872d8e42d008 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1488,7 +1488,11 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, goto put_pages; } + err = -ENOMEM; iod = nvme_alloc_iod(count, length, GFP_KERNEL); + if (!iod) + goto put_pages; + sg = iod->sg; sg_init_table(sg, count); for (i = 0; i < count; i++) { @@ -1501,7 +1505,6 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, sg_mark_end(&sg[i - 1]); iod->nents = count; - err = -ENOMEM; nents = dma_map_sg(&dev->pci_dev->dev, sg, count, write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (!nents) -- cgit v1.2.3 From 61e4ce086df0a64a555880089e3b782517c828c0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 13 May 2014 11:42:01 -0600 Subject: NVMe: Make iod bio timeout a parameter This was originally set to 4 times the IO timeout, but that was when the IO timeout was 5 seconds instead of 30. 20 seconds for total time to failure seemed more reasonable than 2 minutes for most, but other users have requested to make this a module parameter instead. Signed-off-by: Keith Busch [renamed the module parameter to retry_time] [made retry_time static] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 872d8e42d008..1a911067061c 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -48,12 +48,16 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (60 * HZ) -#define IOD_TIMEOUT (4 * NVME_IO_TIMEOUT) +#define IOD_TIMEOUT (retry_time * HZ) unsigned char io_timeout = 30; module_param(io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); +static unsigned char retry_time = 30; +module_param(retry_time, byte, 0644); +MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O"); + static int nvme_major; module_param(nvme_major, int, 0); -- cgit v1.2.3 From 9d43cf646eab948db81913379dacb1dcecacb1eb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 13 May 2014 11:42:02 -0600 Subject: NVMe: Make admin timeout a module parameter Signed-off-by: Keith Busch [made admin_timeout static] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 1a911067061c..12c57eb7c915 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -44,11 +44,15 @@ #include -#define NVME_Q_DEPTH 1024 +#define NVME_Q_DEPTH 1024 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define ADMIN_TIMEOUT (60 * HZ) -#define IOD_TIMEOUT (retry_time * HZ) +#define ADMIN_TIMEOUT (admin_timeout * HZ) +#define IOD_TIMEOUT (retry_time * HZ) + +static unsigned char admin_timeout = 60; +module_param(admin_timeout, byte, 0644); +MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); unsigned char io_timeout = 30; module_param(io_timeout, byte, 0644); -- cgit v1.2.3 From de672b9748f78dcbc663e12ea44cb24dc287baf0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 3 Jun 2014 17:06:26 -0400 Subject: NVMe: Delete NVME_GET_FEAT_TEMP_THRESH This define isn't used, and any code that wanted to use it should use NVME_FEAT_TEMP_THRESH instead. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 7ec61c4a817c..13403ee716da 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -240,7 +240,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */ /* NVMe Namespace and Command Defines */ #define NVME_GET_SMART_LOG_PAGE 0x02 -#define NVME_GET_FEAT_TEMP_THRESH 0x04 #define BYTES_TO_DWORDS 4 #define NVME_MAX_FIRMWARE_SLOT 7 -- cgit v1.2.3 From a51afb54339c5e9ee72df66ae0f2ac5aacfed365 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 13 May 2014 10:32:46 -0600 Subject: NVMe: Fix nvme get/put queue semantics The routines to get and lock nvme queues required the caller to "put" or "unlock" them even if getting one returned NULL. This patch fixes that. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 12c57eb7c915..29a3e85873b5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -285,9 +285,17 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) { + struct nvme_queue *nvmeq; unsigned queue_id = get_cpu_var(*dev->io_queue); + rcu_read_lock(); - return rcu_dereference(dev->queues[queue_id]); + nvmeq = rcu_dereference(dev->queues[queue_id]); + if (nvmeq) + return nvmeq; + + rcu_read_unlock(); + put_cpu_var(*dev->io_queue); + return NULL; } static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) @@ -299,8 +307,15 @@ static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) __acquires(RCU) { + struct nvme_queue *nvmeq; + rcu_read_lock(); - return rcu_dereference(dev->queues[q_idx]); + nvmeq = rcu_dereference(dev->queues[q_idx]); + if (nvmeq) + return nvmeq; + + rcu_read_unlock(); + return NULL; } static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) @@ -809,7 +824,6 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio) int result = -EBUSY; if (!nvmeq) { - put_nvmeq(NULL); bio_endio(bio, -EIO); return; } @@ -884,10 +898,8 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, struct nvme_queue *nvmeq; nvmeq = lock_nvmeq(dev, q_idx); - if (!nvmeq) { - unlock_nvmeq(nvmeq); + if (!nvmeq) return -ENODEV; - } cmdinfo.task = current; cmdinfo.status = -EINTR; @@ -912,9 +924,10 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, if (cmdinfo.status == -EINTR) { nvmeq = lock_nvmeq(dev, q_idx); - if (nvmeq) + if (nvmeq) { nvme_abort_command(nvmeq, cmdid); - unlock_nvmeq(nvmeq); + unlock_nvmeq(nvmeq); + } return -EINTR; } -- cgit v1.2.3 From b4e75cbf1364c4bbce3599c3279892a55b6ede07 Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Fri, 9 May 2014 13:27:07 -0700 Subject: NVMe: Adhere to request queue block accounting enable/disable Recently, a new sysfs control "iostats" was added to selectively enable or disable io statistics collection for request queues. This patch hooks that control. IO statistics collection is rather expensive on large, multi-node machines with drives pushing millions of iops. Having the ability to disable collection if not needed can improve throughput significantly. As a data point, on a quad E5-4640, I see more than 50% throughput improvement when io statistics accounting is disabled during heavily multi-threaded small block random read benchmarks where device performance is in the million iops+ range. Signed-off-by: Sam Bradshaw Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 29a3e85873b5..bb6ce311ad44 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -406,25 +406,30 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) static void nvme_start_io_acct(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - const int rw = bio_data_dir(bio); - int cpu = part_stat_lock(); - part_round_stats(cpu, &disk->part0); - part_stat_inc(cpu, &disk->part0, ios[rw]); - part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio)); - part_inc_in_flight(&disk->part0, rw); - part_stat_unlock(); + if (blk_queue_io_stat(disk->queue)) { + const int rw = bio_data_dir(bio); + int cpu = part_stat_lock(); + part_round_stats(cpu, &disk->part0); + part_stat_inc(cpu, &disk->part0, ios[rw]); + part_stat_add(cpu, &disk->part0, sectors[rw], + bio_sectors(bio)); + part_inc_in_flight(&disk->part0, rw); + part_stat_unlock(); + } } static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) { struct gendisk *disk = bio->bi_bdev->bd_disk; - const int rw = bio_data_dir(bio); - unsigned long duration = jiffies - start_time; - int cpu = part_stat_lock(); - part_stat_add(cpu, &disk->part0, ticks[rw], duration); - part_round_stats(cpu, &disk->part0); - part_dec_in_flight(&disk->part0, rw); - part_stat_unlock(); + if (blk_queue_io_stat(disk->queue)) { + const int rw = bio_data_dir(bio); + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + part_stat_add(cpu, &disk->part0, ticks[rw], duration); + part_round_stats(cpu, &disk->part0); + part_dec_in_flight(&disk->part0, rw); + part_stat_unlock(); + } } static void bio_completion(struct nvme_queue *nvmeq, void *ctx, -- cgit v1.2.3 From dedf4b156134e0dedec18ebecda3e74077fa7c92 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 29 Apr 2014 15:52:27 -0600 Subject: NVMe: Use last bytes of f/w rev SCSI Inquiry After skipping right-padded spaces, use the last four bytes of the firmware revision when reporting the Inquiry Product Revision. These are generally more indicative to what is running. Signed-off-by: Keith Busch Acked-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 13403ee716da..4fc25b980535 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -681,6 +681,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, u8 resp_data_format = 0x02; u8 protect; u8 cmdque = 0x01 << 1; + u8 fw_offset = sizeof(dev->firmware_rev); mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); @@ -716,7 +717,11 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ strncpy(&inq_response[8], "NVMe ", 8); strncpy(&inq_response[16], dev->model, 16); - strncpy(&inq_response[32], dev->firmware_rev, 4); + + while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) + fw_offset--; + fw_offset -= 4; + strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -- cgit v1.2.3 From bd67608a6127c994e897c49cc4f72d9095925301 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 3 Jun 2014 23:04:30 -0400 Subject: NVMe: Rename io_timeout to nvme_io_timeout It's positively immoral to have a global variable called 'io_timeout'. Keep the module parameter called io_timeout, though. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 4 ++-- include/linux/nvme.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index bb6ce311ad44..2af079e571fc 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -54,8 +54,8 @@ static unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); -unsigned char io_timeout = 30; -module_param(io_timeout, byte, 0644); +unsigned char nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); static unsigned char retry_time = 30; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1813cfdb7e80..8541dd920bb7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -62,8 +62,8 @@ enum { #define NVME_VS(major, minor) (major << 16 | minor) -extern unsigned char io_timeout; -#define NVME_IO_TIMEOUT (io_timeout * HZ) +extern unsigned char nvme_io_timeout; +#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) /* * Represents an NVM Express device. Each nvme_dev is a PCI function. -- cgit v1.2.3 From f3db22feb5de6b98b7bae924c2d4b6c8d65bedae Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 11 Jun 2014 11:51:35 -0600 Subject: NVMe: Fix hot cpu notification dead lock There is a potential dead lock if a cpu event occurs during nvme probe since it registered with hot cpu notification. This fixes the race by having the module register with notification outside of probe rather than have each device register. The actual work is done in a scheduled work queue instead of in the notifier since assigning IO queues has the potential to block if the driver creates additional queues. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 35 +++++++++++++++++++++++++---------- include/linux/nvme.h | 2 +- 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2af079e571fc..e0ac1210fe31 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -73,6 +73,7 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; +static struct notifier_block nvme_nb; static void nvme_reset_failed_dev(struct work_struct *ws); @@ -2115,14 +2116,25 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } +static void nvme_cpu_workfn(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work); + if (dev->initialized) + nvme_assign_io_queues(dev); +} + static int nvme_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - struct nvme_dev *dev = container_of(self, struct nvme_dev, nb); + struct nvme_dev *dev; + switch (action) { case CPU_ONLINE: case CPU_DEAD: - nvme_assign_io_queues(dev); + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) + schedule_work(&dev->cpu_work); + spin_unlock(&dev_list_lock); break; } return NOTIFY_OK; @@ -2191,11 +2203,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_free_queues(dev, nr_io_queues + 1); nvme_assign_io_queues(dev); - dev->nb.notifier_call = &nvme_cpu_notify; - result = register_hotcpu_notifier(&dev->nb); - if (result) - goto free_queues; - return 0; free_queues: @@ -2495,8 +2502,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) int i; dev->initialized = 0; - unregister_hotcpu_notifier(&dev->nb); - nvme_dev_list_remove(dev); if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { @@ -2767,6 +2772,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); + INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); @@ -2836,6 +2842,7 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); + flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); nvme_dev_remove(dev); nvme_dev_shutdown(dev); @@ -2923,11 +2930,18 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; - result = pci_register_driver(&nvme_driver); + nvme_nb.notifier_call = &nvme_cpu_notify; + result = register_hotcpu_notifier(&nvme_nb); if (result) goto unregister_blkdev; + + result = pci_register_driver(&nvme_driver); + if (result) + goto unregister_hotcpu; return 0; + unregister_hotcpu: + unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: @@ -2938,6 +2952,7 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); + unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8541dd920bb7..2bf403195c09 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -90,7 +90,7 @@ struct nvme_dev { struct miscdevice miscdev; work_func_t reset_workfn; struct work_struct reset_work; - struct notifier_block nb; + struct work_struct cpu_work; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3 From ef351b97dedaa7a6e257ed4f554718e384d8786b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 13 Jun 2014 10:54:21 -0400 Subject: NVMe: Use Log Page constants in SCSI emulation The nvme-scsi file defined its own Log Page constant. Use the newly-defined one from the header file instead. Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 4fc25b980535..24308ae8abf5 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -239,7 +239,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define READ_CAP_16_RESP_SIZE 32 /* NVMe Namespace and Command Defines */ -#define NVME_GET_SMART_LOG_PAGE 0x02 #define BYTES_TO_DWORDS 4 #define NVME_MAX_FIRMWARE_SLOT 7 @@ -1019,7 +1018,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, c.common.nsid = cpu_to_le32(0xFFFFFFFF); c.common.prp1 = cpu_to_le64(dma_addr); c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) - 1) << 16) | NVME_GET_SMART_LOG_PAGE); + BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); res = nvme_submit_admin_cmd(dev, &c, NULL); if (res != NVME_SC_SUCCESS) { temp_c = LOG_TEMP_UNKNOWN; @@ -1087,7 +1086,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.nsid = cpu_to_le32(0xFFFFFFFF); c.common.prp1 = cpu_to_le64(dma_addr); c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) - 1) << 16) | NVME_GET_SMART_LOG_PAGE); + BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); res = nvme_submit_admin_cmd(dev, &c, NULL); if (res != NVME_SC_SUCCESS) { temp_c_cur = LOG_TEMP_UNKNOWN; -- cgit v1.2.3 From b8e080847a7292347a3eee76264f77e4abcb61f7 Mon Sep 17 00:00:00 2001 From: Dan McLeran Date: Fri, 6 Jun 2014 08:27:27 -0600 Subject: NVMe: Fix START_STOP_UNIT Scsi->NVMe translation. This patch contains several fixes for Scsi START_STOP_UNIT. The previous code did not account for signed vs. unsigned arithmetic which resulted in an invalid lowest power state caculation when the device only supports 1 power state. The code for Power Condition == 2 (Idle) was not following the spec. The spec calls for setting the device to specific power states, depending upon Power Condition Modifier, without accounting for the number of power states supported by the device. The code for Power Condition == 3 (Standby) was using a hard-coded '0' which is replaced with the macro POWER_STATE_0. Signed-off-by: Dan McLeran Reviewed-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-scsi.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 24308ae8abf5..a4cd6d691c63 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1476,7 +1476,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } id_ctrl = mem; - lowest_pow_st = id_ctrl->npss - 1; + lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); switch (pc) { case NVME_POWER_STATE_START_VALID: @@ -1493,20 +1493,19 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, break; case NVME_POWER_STATE_IDLE: /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ - /* min of desired state and (lps-1) because lps is STOP */ if (pcmod == 0x0) - ps_desired = min(POWER_STATE_1, (lowest_pow_st - 1)); + ps_desired = POWER_STATE_1; else if (pcmod == 0x1) - ps_desired = min(POWER_STATE_2, (lowest_pow_st - 1)); + ps_desired = POWER_STATE_2; else if (pcmod == 0x2) - ps_desired = min(POWER_STATE_3, (lowest_pow_st - 1)); + ps_desired = POWER_STATE_3; break; case NVME_POWER_STATE_STANDBY: /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ if (pcmod == 0x0) - ps_desired = max(0, (lowest_pow_st - 2)); + ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); else if (pcmod == 0x1) - ps_desired = max(0, (lowest_pow_st - 1)); + ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); break; case NVME_POWER_STATE_LU_CONTROL: default: -- cgit v1.2.3