From a87835e9ecbd29ddffdd5bae81dc913afc221abd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:11 -0700 Subject: nvme-core: use u16 type for directives In nvme_configure_directives() when calculating number of streams use u16 instead of unsigned type in the min_t() since target variable ctrl->nr_streams is of type u16. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cd763f31864b..c4ba51dd213f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -555,7 +555,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) goto out_disable_stream; } - ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); return 0; -- cgit v1.2.3 From d4047cf99421d434660a5a0c61ac3e83b4ad0dad Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:12 -0700 Subject: nvme-core: use u16 type for ctrl->sqsize In nvme_init_identify() when calculating submission queue size use u16 instead of int type in the min_t() since target variable ctrl->sqsize is of type u16. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c4ba51dd213f..c1de05646a02 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2873,7 +2873,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); -- cgit v1.2.3 From 61f3b89630973037f67d8e25e5d26e80a51a7b37 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 17 Jun 2020 10:05:13 +0200 Subject: nvme-pci: use unsigned for io queue depth The NVMe PCIe declares module parameter io_queue_depth as int. Change this to u16 as queue depth can never be negative. Now to reflect this update module parameter getter function from param_get_int() -> param_get_uint() and respective setter function with type of n changed from int to u16 with param_set_int() to param_set_ushort(). Finally update struct nvme_dev q_depth member to u16 and use u16 in min_t() when calculating dev->q_depth in the nvme_pci_enable() (since q_depth is now u16) and use unsigned int instead of int when calculating dev->tagset.queue_depth as target variable tagset->queue_depth is of type unsigned int in nvme_dev_add(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 74a2e2e00794..7ed1e2dfbee6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -61,10 +61,10 @@ MODULE_PARM_DESC(sgl_threshold, static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, - .get = param_get_int, + .get = param_get_uint, }; -static int io_queue_depth = 1024; +static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); @@ -115,7 +115,7 @@ struct nvme_dev { unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; - int q_depth; + u16 q_depth; int io_sqes; u32 db_stride; void __iomem *bar; @@ -151,13 +151,14 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int n = 0, ret; + int ret; + u16 n; - ret = kstrtoint(val, 10, &n); + ret = kstrtou16(val, 10, &n); if (ret != 0 || n < 2) return -EINVAL; - return param_set_int(val, kp); + return param_set_ushort(val, kp); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -2261,8 +2262,8 @@ static void nvme_dev_add(struct nvme_dev *dev) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev->ctrl.numa_node; - dev->tagset.queue_depth = - min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, + BLK_MQ_MAX_DEPTH) - 1; dev->tagset.cmd_size = sizeof(struct nvme_iod); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; @@ -2321,7 +2322,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); -- cgit v1.2.3 From 9dc54a0d15665088f4c4ee30df5d2a94f03be3fa Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:14 -0700 Subject: nvme-pci: code cleanup for nvme_alloc_host_mem() Although use of for loop is preferred it is not a common practice to have 80 char long for loop initialization and comparison section. Use temp variables for calculating values and replace them in the for loop with size of all variables to set to u64 since preferred variable is declared as u64. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7ed1e2dfbee6..cd14b1a0ef90 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1933,12 +1933,12 @@ out: static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { - u32 chunk_size; + u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); + u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); + u64 chunk_size; /* start big and work our way down */ - for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); - chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); - chunk_size /= 2) { + for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) return 0; -- cgit v1.2.3 From ad509996432e54284d35f2e42b0f78273cbef27d Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 8 Jun 2020 09:20:02 -0700 Subject: nvme-pci: remove the empty line at the beginning of nvme_should_reset() Just cleanup by removing the empty line. Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cd14b1a0ef90..b3538141ec11 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1155,7 +1155,6 @@ static void abort_endio(struct request *req, blk_status_t error) static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) { - /* If true, indicates loss of adapter communication, possibly by a * NVMe Subsystem reset. */ -- cgit v1.2.3 From b261b61c9e03ddf21464cda777a2457113b118be Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 8 Jun 2020 09:20:01 -0700 Subject: nvmet-loop: remove unused 'target_ctrl' in nvme_loop_ctrl This field is never used since the introduction of nvme loopback by commit 3a85a5de29ea ("nvme-loop: add a NVMe loopback host driver"). Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 8a0d4fe7bc18..f2c80a51985f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -36,7 +36,6 @@ struct nvme_loop_ctrl { struct nvme_loop_iod async_event_iod; struct nvme_ctrl ctrl; - struct nvmet_ctrl *target_ctrl; struct nvmet_port *port; }; -- cgit v1.2.3 From 4e102559725683a4bf7240db8ed12b6b64cfccbf Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 2 Jun 2020 16:15:45 +0300 Subject: nvmet-tcp: remove has_keyed_sgls initialization Since the nvmet_tcp_ops is static, there is no need to initialize values to zero. Signed-off-by: Max Gurtovoy Reviewed-by: Israel Rukshin Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index de9217cfd22d..1ce22b698f4d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1717,7 +1717,6 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_TCP, .msdbd = 1, - .has_keyed_sgls = 0, .add_port = nvmet_tcp_add_port, .remove_port = nvmet_tcp_remove_port, .queue_response = nvmet_tcp_queue_response, -- cgit v1.2.3 From 6fa350f7145677cf6d0b86eff33eb2e3e246770c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 2 Jun 2020 16:15:46 +0300 Subject: nvmet: introduce flags member in nvmet_fabrics_ops Replace has_keyed_sgls and metadata_support booleans with a flags member that will be used for adding more features in the future. Signed-off-by: Max Gurtovoy Reviewed-by: Himanshu Madhani Reviewed-by: Israel Rukshin Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/core.c | 2 +- drivers/nvme/target/discovery.c | 2 +- drivers/nvme/target/nvmet.h | 5 +++-- drivers/nvme/target/rdma.c | 3 +-- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 1db8c0498668..95bb3bc4e335 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -427,7 +427,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->awupf = 0; id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ - if (ctrl->ops->has_keyed_sgls) + if (ctrl->ops->flags & NVMF_KEYED_SGLS) id->sgls |= cpu_to_le32(1 << 2); if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6816507fba58..9cdc39c8b729 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -336,7 +336,7 @@ int nvmet_enable_port(struct nvmet_port *port) * If the user requested PI support and the transport isn't pi capable, * don't enable the port. */ - if (port->pi_enable && !ops->metadata_support) { + if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) { pr_err("T10-PI is not supported by transport type %d\n", port->disc_addr.trtype); ret = -EINVAL; diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 40cf0b6e6c9d..f40c05c33c3a 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -277,7 +277,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ - if (ctrl->ops->has_keyed_sgls) + if (ctrl->ops->flags & NVMF_KEYED_SGLS) id->sgls |= cpu_to_le32(1 << 2); if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 809691291e73..6f8bd6a93575 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -286,8 +286,9 @@ struct nvmet_fabrics_ops { struct module *owner; unsigned int type; unsigned int msdbd; - bool has_keyed_sgls : 1; - bool metadata_support : 1; + unsigned int flags; +#define NVMF_KEYED_SGLS (1 << 0) +#define NVMF_METADATA_SUPPORTED (1 << 1) void (*queue_response)(struct nvmet_req *req); int (*add_port)(struct nvmet_port *port); void (*remove_port)(struct nvmet_port *port); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 76ea23a2c2be..6731e0349480 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1970,8 +1970,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, .msdbd = 1, - .has_keyed_sgls = 1, - .metadata_support = 1, + .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED, .add_port = nvmet_rdma_add_port, .remove_port = nvmet_rdma_remove_port, .queue_response = nvmet_rdma_queue_response, -- cgit v1.2.3 From a0f0dbaa6986d6777a4a6835ec91616d5c75ac25 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 11 Jun 2020 18:16:59 -0700 Subject: nvmet: use unsigned type for u64 In function nvmet_subsys_atte_version_show() which uses the NVME_XXX() macros related to version (of type u64) get rid of the int type cast when printing subsys version and use appropriate format specifier for u64. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 419e0d4ce79b..cdec47de89ed 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -862,14 +862,14 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item, struct nvmet_subsys *subsys = to_subsys(item); if (NVME_TERTIARY(subsys->ver)) - return snprintf(page, PAGE_SIZE, "%d.%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver), - (int)NVME_TERTIARY(subsys->ver)); - - return snprintf(page, PAGE_SIZE, "%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver)); + return snprintf(page, PAGE_SIZE, "%llu.%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); + + return snprintf(page, PAGE_SIZE, "%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver)); } static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, -- cgit v1.2.3 From ca8f4beebfb4979acb664e02365ef6e662ef95b0 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 25 May 2020 21:21:18 -0700 Subject: nvme-fcloop: verify wwnn and wwpn format The nvme host and target verify the wwnn and wwpn format via nvme_fc_parse_traddr(). For instance, it is required that the length of wwnn to be either 21 ("nn-0x") or 19 (nn-). Add this verification to nvme-fcloop so that the input should always be in hex and the length of input should always be 18. Otherwise, the user may use e.g. 0x2 to create fcloop local port, while 0x0000000000000002 is required for nvme host and target. This makes the requirement of format confusing. Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 2ff1d1334a03..c97e60b71bbc 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -43,6 +43,17 @@ static const match_table_t opt_tokens = { { NVMF_OPT_ERR, NULL } }; +static int fcloop_verify_addr(substring_t *s) +{ + size_t blen = s->to - s->from + 1; + + if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 || + strncmp(s->from, "0x", 2)) + return -EINVAL; + + return 0; +} + static int fcloop_parse_options(struct fcloop_ctrl_options *opts, const char *buf) @@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts, opts->mask |= token; switch (token) { case NVMF_OPT_WWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } opts->wwnn = token64; break; case NVMF_OPT_WWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } @@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts, opts->fcaddr = token; break; case NVMF_OPT_LPWWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } opts->lpwwnn = token64; break; case NVMF_OPT_LPWWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } @@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname, token = match_token(p, opt_tokens, args); switch (token) { case NVMF_OPT_WWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } *nname = token64; break; case NVMF_OPT_WWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } -- cgit v1.2.3 From 15ec928a65e0528ef4999e2947b4802b772f0891 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:22 -0700 Subject: nvme-tcp: have queue prod/cons send list become a llist The queue processing will splice to a queue local list, this should alleviate some contention on the send_list lock, but also prepares us to the next patch where we look on these lists for network stack flag optimization. Remove queue lock as its not used anymore. Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich [hch: simplified a loop] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 7006aca89456..478868572c81 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -46,6 +46,7 @@ struct nvme_tcp_request { u32 pdu_sent; u16 ttag; struct list_head entry; + struct llist_node lentry; __le32 ddgst; struct bio *curr_bio; @@ -75,8 +76,8 @@ struct nvme_tcp_queue { struct work_struct io_work; int io_cpu; - spinlock_t lock; struct mutex send_mutex; + struct llist_head req_list; struct list_head send_list; /* recv state */ @@ -266,10 +267,8 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, struct nvme_tcp_queue *queue = req->queue; bool empty; - spin_lock(&queue->lock); - empty = list_empty(&queue->send_list) && !queue->request; - list_add_tail(&req->entry, &queue->send_list); - spin_unlock(&queue->lock); + empty = llist_add(&req->lentry, &queue->req_list) && + list_empty(&queue->send_list) && !queue->request; /* * if we're the first on the send_list and we can try to send @@ -285,18 +284,33 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, } } +static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + struct llist_node *node; + + for (node = llist_del_all(&queue->req_list); node; node = node->next) { + req = llist_entry(node, struct nvme_tcp_request, lentry); + list_add(&req->entry, &queue->send_list); + } +} + static inline struct nvme_tcp_request * nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; - spin_lock(&queue->lock); req = list_first_entry_or_null(&queue->send_list, struct nvme_tcp_request, entry); - if (req) - list_del(&req->entry); - spin_unlock(&queue->lock); + if (!req) { + nvme_tcp_process_req_list(queue); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (unlikely(!req)) + return NULL; + } + list_del(&req->entry); return req; } @@ -1344,8 +1358,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int ret, rcv_pdu_size; queue->ctrl = ctrl; + init_llist_head(&queue->req_list); INIT_LIST_HEAD(&queue->send_list); - spin_lock_init(&queue->lock); mutex_init(&queue->send_mutex); INIT_WORK(&queue->io_work, nvme_tcp_io_work); queue->queue_size = queue_size; -- cgit v1.2.3 From 86f0348ace1510d7ac25124b096fb88a6ab45270 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:23 -0700 Subject: nvme-tcp: leverage request plugging blk-mq request plugging can improve the execution of our pipeline. When we queue a request we actually trigger our I/O worker thread yielding a context switch by definition. However if we know that there are more requests in the pipe that are coming, we are better off not trigger our I/O worker and only do that for the last request in the batch (bd->last). By having it, we improve efficiency by amortizing context switches over a batch of requests. Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 478868572c81..2d3962c164a4 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -262,7 +262,7 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, } static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync) + bool sync, bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; @@ -279,7 +279,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, sync && empty && mutex_trylock(&queue->send_mutex)) { nvme_tcp_try_send(queue); mutex_unlock(&queue->send_mutex); - } else { + } else if (last) { queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } } @@ -610,7 +610,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, req->state = NVME_TCP_SEND_H2C_PDU; req->offset = 0; - nvme_tcp_queue_request(req, false); + nvme_tcp_queue_request(req, false, true); return 0; } @@ -2120,7 +2120,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; - nvme_tcp_queue_request(&ctrl->async_req, true); + nvme_tcp_queue_request(&ctrl->async_req, true, true); } static enum blk_eh_timer_return @@ -2232,6 +2232,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return 0; } +static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + + if (!llist_empty(&queue->req_list)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2251,7 +2259,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - nvme_tcp_queue_request(req, true); + nvme_tcp_queue_request(req, true, bd->last); return BLK_STS_OK; } @@ -2319,6 +2327,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) static const struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, + .commit_rqs = nvme_tcp_commit_rqs, .complete = nvme_complete_rq, .init_request = nvme_tcp_init_request, .exit_request = nvme_tcp_exit_request, -- cgit v1.2.3 From 122e5b9f3d370ae11e1502d14ff5c7ea9b144a76 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:24 -0700 Subject: nvme-tcp: optimize network stack with setting msg flags according to batch size If we have a long list of request to send, signal the network stack that more is coming (MSG_MORE). If we have nothing else, signal MSG_EOR. Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2d3962c164a4..b2e73e19ef01 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -79,6 +79,7 @@ struct nvme_tcp_queue { struct mutex send_mutex; struct llist_head req_list; struct list_head send_list; + bool more_requests; /* recv state */ void *pdu; @@ -277,7 +278,9 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, */ if (queue->io_cpu == smp_processor_id() && sync && empty && mutex_trylock(&queue->send_mutex)) { + queue->more_requests = !last; nvme_tcp_try_send(queue); + queue->more_requests = false; mutex_unlock(&queue->send_mutex); } else if (last) { queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); @@ -877,6 +880,12 @@ done: read_unlock(&sk->sk_callback_lock); } +static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +{ + return !list_empty(&queue->send_list) || + !llist_empty(&queue->req_list) || queue->more_requests; +} + static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) { queue->request = NULL; @@ -898,7 +907,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) bool last = nvme_tcp_pdu_last_send(req, len); int ret, flags = MSG_DONTWAIT; - if (last && !queue->data_digest) + if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) flags |= MSG_EOR; else flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; @@ -945,7 +954,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) int flags = MSG_DONTWAIT; int ret; - if (inline_data) + if (inline_data || nvme_tcp_queue_more(queue)) flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; else flags |= MSG_EOR; @@ -1010,12 +1019,17 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; int ret; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = &req->ddgst + req->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset }; + if (nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; -- cgit v1.2.3 From b8a12e93570d8aa7fbfe2b8909eee8fd0f778afd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 24 Jun 2020 12:27:16 -0700 Subject: nvmet-tcp: simplify nvmet_process_resp_list We can make it shorter and simpler without some redundant checks. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 1ce22b698f4d..9eda91162fe4 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -459,17 +459,11 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) { struct llist_node *node; + struct nvmet_tcp_cmd *cmd; - node = llist_del_all(&queue->resp_list); - if (!node) - return; - - while (node) { - struct nvmet_tcp_cmd *cmd = llist_entry(node, - struct nvmet_tcp_cmd, lentry); - + for (node = llist_del_all(&queue->resp_list); node; node = node->next) { + cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry); list_add(&cmd->entry, &queue->resp_send_list); - node = node->next; queue->send_list_len++; } } -- cgit v1.2.3 From f5af577d553103115579b5e404070dfab4d00332 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 24 Jun 2020 14:49:58 +0800 Subject: nvme: use USEC_PER_SEC instead of magic numbers Use USEC_PER_SEC instead of magic numbers to make code more readable. Reviewed-by: Sagi Grimberg Signed-off-by: Baolin Wang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c1de05646a02..96898040a6d5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2947,7 +2947,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (id->rtd3e) { /* us -> s */ - u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; + u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, shutdown_timeout, 60); -- cgit v1.2.3 From 71010c30945425203da8d069a10fa45a05a00f96 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 29 Jun 2020 12:06:39 -0700 Subject: nvme: implement multiple I/O Command Set support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements support for multiple I/O Command Sets. NVMe TP 4056 introduces a method to enumerate multiple command sets per namespace. If the command set is exposed, this method for enumeration will be used instead of the traditional method that uses the CC.CSS register command set register for command set identification. For namespaces where the Command Set Identifier is not supported or recognized, the specific namespace will not be created. Reviewed-by: Javier González Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Matias Bjørling Reviewed-by: Daniel Wagner Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 53 +++++++++++++++++++++++++++++++++++++++--------- drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 19 +++++++++++++++-- 3 files changed, 61 insertions(+), 12 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 96898040a6d5..892291dbee64 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1056,8 +1056,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, - struct nvme_ns_id_desc *cur) + struct nvme_ns_id_desc *cur, bool *csi_seen) { const char *warn_str = "ctrl returned bogus length:"; void *data = cur; @@ -1087,6 +1092,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, } uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; + case NVME_NIDT_CSI: + if (cur->nidl != NVME_NIDT_CSI_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", + warn_str, cur->nidl); + return -1; + } + memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); + *csi_seen = true; + return NVME_NIDT_CSI_LEN; default: /* Skip unknown types */ return cur->nidl; @@ -1097,10 +1111,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { struct nvme_command c = { }; - int status; + bool csi_seen = false; + int status, pos, len; void *data; - int pos; - int len; c.identify.opcode = nvme_admin_identify; c.identify.nsid = cpu_to_le32(nsid); @@ -1125,7 +1138,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, * device just because of a temporal retry-able error (such * as path of transport errors). */ - if (status > 0 && (status & NVME_SC_DNR)) + if (status > 0 && (status & NVME_SC_DNR) && !nvme_multi_css(ctrl)) status = 0; goto free_data; } @@ -1136,12 +1149,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, if (cur->nidl == 0) break; - len = nvme_process_ns_desc(ctrl, ids, cur); + len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); if (len < 0) - goto free_data; + break; len += sizeof(*cur); } + + if (nvme_multi_css(ctrl) && !csi_seen) { + dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", + nsid); + status = -EINVAL; + } + free_data: kfree(data); return status; @@ -1798,7 +1818,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) + if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) return nvme_identify_ns_descs(ctrl, nsid, ids); return 0; } @@ -1814,7 +1834,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && - memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && + a->csi == b->csi; } static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1936,6 +1957,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->lba_shift == 0) ns->lba_shift = 9; + switch (ns->head->ids.csi) { + case NVME_CSI_NVM: + break; + default: + dev_warn(ctrl->device, "unknown csi:%d ns:%d\n", + ns->head->ids.csi, ns->head->ns_id); + return -ENODEV; + } + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && is_power_of_2(ctrl->max_hw_sectors)) iob = ctrl->max_hw_sectors; @@ -2270,7 +2300,10 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ctrl->page_size = 1 << page_shift; - ctrl->ctrl_config = NVME_CC_CSS_NVM; + if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) + ctrl->ctrl_config = NVME_CC_CSS_CSI; + else + ctrl->ctrl_config = NVME_CC_CSS_NVM; ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b5a2e8b7e0be..5573159f714d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -339,6 +339,7 @@ struct nvme_ns_ids { u8 eui64[8]; u8 nguid[16]; uuid_t uuid; + u8 csi; }; /* diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5ce51ab4c50e..81ffe5247505 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -132,6 +132,7 @@ enum { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) @@ -162,7 +163,6 @@ enum { enum { NVME_CC_ENABLE = 1 << 0, - NVME_CC_CSS_NVM = 0 << 4, NVME_CC_EN_SHIFT = 0, NVME_CC_CSS_SHIFT = 4, NVME_CC_MPS_SHIFT = 7, @@ -170,6 +170,9 @@ enum { NVME_CC_SHN_SHIFT = 14, NVME_CC_IOSQES_SHIFT = 16, NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, @@ -179,6 +182,8 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI = 1 << 6, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -374,6 +379,8 @@ enum { NVME_ID_CNS_CTRL = 0x01, NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -383,6 +390,10 @@ enum { NVME_ID_CNS_UUID_LIST = 0x17, }; +enum { + NVME_CSI_NVM = 0, +}; + enum { NVME_DIR_IDENTIFY = 0x00, NVME_DIR_STREAMS = 0x01, @@ -435,11 +446,13 @@ struct nvme_ns_id_desc { #define NVME_NIDT_EUI64_LEN 8 #define NVME_NIDT_NGUID_LEN 16 #define NVME_NIDT_UUID_LEN 16 +#define NVME_NIDT_CSI_LEN 1 enum { NVME_NIDT_EUI64 = 0x01, NVME_NIDT_NGUID = 0x02, NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, }; struct nvme_smart_log { @@ -972,7 +985,9 @@ struct nvme_identify { __u8 cns; __u8 rsvd3; __le16 ctrlid; - __u32 rsvd11[5]; + __u8 rsvd11[3]; + __u8 csi; + __u32 rsvd12[4]; }; #define NVME_IDENTIFY_DATA_SIZE 4096 -- cgit v1.2.3 From be93e87e780253780df9bb6ecc9bc1199b0d94c3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jun 2020 12:06:40 -0700 Subject: nvme: support for multiple Command Sets Supported and Effects log pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Commands Supported and Effects log page was extended with a CSI field that enables the host to query the log page for each command set supported. Retrieve this log page for each command set that an attached namespace supports, and save a pointer to that log in the namespace head. Reviewed-by: Matias Bjørling Reviewed-by: Javier González Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 79 ++++++++++++++++++++++++++++++++----------- drivers/nvme/host/hwmon.c | 2 +- drivers/nvme/host/lightnvm.c | 4 +-- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/nvme.h | 10 +++++- include/linux/nvme.h | 4 ++- 6 files changed, 76 insertions(+), 25 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 892291dbee64..62b2cdc764da 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1370,8 +1370,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects = 0; if (ns) { - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn(ctrl->device, "IO command:%02x has unhandled effects:%08x\n", @@ -2851,7 +2851,7 @@ out_unlock: return ret; } -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset) { struct nvme_command c = { }; @@ -2865,27 +2865,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); + c.get_log_page.csi = csi; return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) { + struct nvme_cel *cel, *ret = NULL; + + spin_lock(&ctrl->lock); + list_for_each_entry(cel, &ctrl->cels, entry) { + if (cel->csi == csi) { + ret = cel; + break; + } + } + spin_unlock(&ctrl->lock); + + return ret; +} + +static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, + struct nvme_effects_log **log) +{ + struct nvme_cel *cel = nvme_find_cel(ctrl, csi); int ret; - if (!ctrl->effects) - ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + if (cel) + goto out; - if (!ctrl->effects) - return 0; + cel = kzalloc(sizeof(*cel), GFP_KERNEL); + if (!cel) + return -ENOMEM; - ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, - ctrl->effects, sizeof(*ctrl->effects), 0); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi, + &cel->log, sizeof(cel->log), 0); if (ret) { - kfree(ctrl->effects); - ctrl->effects = NULL; + kfree(cel); + return ret; } - return ret; + + cel->csi = csi; + + spin_lock(&ctrl->lock); + list_add_tail(&cel->entry, &ctrl->cels); + spin_unlock(&ctrl->lock); +out: + *log = &cel->log; + return 0; } /* @@ -2918,7 +2946,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl); + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); if (ret < 0) goto out_free; } @@ -3551,6 +3579,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + if (head->ids.csi) { + ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); + if (ret) + goto out_cleanup_srcu; + } else + head->effects = ctrl->effects; + ret = nvme_mpath_alloc_disk(ctrl, head); if (ret) goto out_cleanup_srcu; @@ -3891,8 +3926,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. */ - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, - log_size, 0); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, + NVME_CSI_NVM, log, log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -4036,8 +4071,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log, - sizeof(*log), 0)) + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, + log, sizeof(*log), 0)) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } @@ -4174,11 +4209,16 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; + struct nvme_cel *cel, *next; if (subsys && ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); - kfree(ctrl->effects); + list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { + list_del(&cel->entry); + kfree(cel); + } + nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4209,6 +4249,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); + INIT_LIST_HEAD(&ctrl->cels); init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 2e6477ed420f..23ba8bf678ae 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) int ret; ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, - &data->log, sizeof(data->log), 0); + NVME_CSI_NVM, &data->log, sizeof(data->log), 0); return ret <= 0 ? ret : -EIO; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 69608755d415..8e562d0f2c30 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, dev_meta_off = dev_meta; ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, - offset); + NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, + dev_meta, len, offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a37a595411e..74bad4e3d377 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -527,7 +527,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl) int error; mutex_lock(&ctrl->ana_lock); - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, ctrl->ana_log_buf, ctrl->ana_log_size, 0); if (error) { dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5573159f714d..fe9424c7097f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -191,6 +191,12 @@ struct nvme_fault_inject { #endif }; +struct nvme_cel { + struct list_head entry; + struct nvme_effects_log log; + u8 csi; +}; + struct nvme_ctrl { bool comp_seen; enum nvme_ctrl_state state; @@ -257,6 +263,7 @@ struct nvme_ctrl { unsigned long quirks; struct nvme_id_power_state psd[32]; struct nvme_effects_log *effects; + struct list_head cels; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; @@ -359,6 +366,7 @@ struct nvme_ns_head { struct kref ref; bool shared; int instance; + struct nvme_effects_log *effects; #ifdef CONFIG_NVME_MULTIPATH struct gendisk *disk; struct bio_list requeue_list; @@ -561,7 +569,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); extern const struct attribute_group *nvme_ns_id_attr_groups[]; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 81ffe5247505..95cd03e240a1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1101,7 +1101,9 @@ struct nvme_get_log_page_command { }; __le64 lpo; }; - __u32 rsvd14[2]; + __u8 rsvd14[3]; + __u8 csi; + __u32 rsvd15; }; struct nvme_directive_cmd { -- cgit v1.2.3 From 240e6ee272c07a2636dfc7d65f5bbb18377c49e5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jun 2020 12:06:41 -0700 Subject: nvme: support for zoned namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined in NVM Express TP4053. Zoned namespaces are discovered based on their Command Set Identifier reported in the namespaces Namespace Identification Descriptor list. A successfully discovered Zoned Namespace will be registered with the block layer as a host managed zoned block device with Zone Append command support. A namespace that does not support append is not supported by the driver. Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Himanshu Madhani Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Ajay Joshi Signed-off-by: Aravind Ramesh Signed-off-by: Niklas Cassel Signed-off-by: Matias Bjørling Signed-off-by: Damien Le Moal Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- block/Kconfig | 5 +- drivers/nvme/host/Makefile | 1 + drivers/nvme/host/core.c | 97 ++++++++++++++--- drivers/nvme/host/nvme.h | 39 +++++++ drivers/nvme/host/zns.c | 254 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 111 ++++++++++++++++++++ 6 files changed, 492 insertions(+), 15 deletions(-) create mode 100644 drivers/nvme/host/zns.c (limited to 'drivers/nvme') diff --git a/block/Kconfig b/block/Kconfig index 9357d7302398..bbad5e8bbffe 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -86,9 +86,10 @@ config BLK_DEV_ZONED select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables - support for ZAC/ZBC host-managed and host-aware zoned block devices. + support for ZAC/ZBC/ZNS host-managed and host-aware zoned block + devices. - Say yes here if you have a ZAC or ZBC storage device. + Say yes here if you have a ZAC, ZBC, or ZNS storage device. config BLK_DEV_THROTTLING bool "Block layer bio throttling support" diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index fc7b26be692d..d7f6a87687b8 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -13,6 +13,7 @@ nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o +nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 62b2cdc764da..a8ee10a0cd32 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static int nvme_revalidate_disk(struct gendisk *disk); +static int _nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req) nvme_retry_req(req); return; } + } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) { + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); } nvme_trace_bio_complete(req, status); @@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, } static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) + struct request *req, struct nvme_command *cmnd, + enum nvme_opcode op) { struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; @@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; + if (op == nvme_cmd_zone_append) + control |= NVME_RW_APPEND_PIREMAP; cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; } @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); break; + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_RESET: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); + break; + case REQ_OP_ZONE_OPEN: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); + break; + case REQ_OP_ZONE_FINISH: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); + break; case REQ_OP_WRITE_ZEROES: ret = nvme_setup_write_zeroes(ns, req, cmd); break; @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, ret = nvme_setup_discard(ns, req, cmd); break; case REQ_OP_READ: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); + break; case REQ_OP_WRITE: - ret = nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); + break; + case REQ_OP_ZONE_APPEND: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; default: WARN_ON_ONCE(1); @@ -1398,14 +1423,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_update_formats(struct nvme_ctrl *ctrl) +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) { struct nvme_ns *ns; down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - if (ns->disk && nvme_revalidate_disk(ns->disk)) + if (ns->disk && _nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); + else if (blk_queue_is_zoned(ns->disk->queue)) { + /* + * IO commands are required to fully revalidate a zoned + * device. Force the command effects to trigger rescan + * work so report zones can run in a context with + * unfrozen IO queues. + */ + *effects |= NVME_CMD_EFFECTS_NCC; + } up_read(&ctrl->namespaces_rwsem); } @@ -1417,7 +1451,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) * this command. */ if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl); + nvme_update_formats(ctrl, &effects); if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); @@ -1532,7 +1566,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, * Issue ioctl requests on the first available path. Note that unlike normal * block layer requests we will not retry failed request on another controller. */ -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, int *srcu_idx) { #ifdef CONFIG_NVME_MULTIPATH @@ -1552,7 +1586,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, return disk->private_data; } -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) { if (head) srcu_read_unlock(&head->srcu, idx); @@ -1945,23 +1979,34 @@ static void nvme_update_disk_info(struct gendisk *disk, static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; + int ret; u32 iob; /* * If identify namespace failed, use default 512 byte block size so * block layer can use before failing read/write for 0 capacity. */ - ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; + ns->lba_shift = id->lbaf[lbaf].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; switch (ns->head->ids.csi) { case NVME_CSI_NVM: break; + case NVME_CSI_ZNS: + ret = nvme_update_zone_info(disk, ns, lbaf); + if (ret) { + dev_warn(ctrl->device, + "failed to add zoned namespace:%u ret:%d\n", + ns->head->ns_id, ret); + return ret; + } + break; default: - dev_warn(ctrl->device, "unknown csi:%d ns:%d\n", + dev_warn(ctrl->device, "unknown csi:%u ns:%u\n", ns->head->ids.csi, ns->head->ns_id); return -ENODEV; } @@ -1973,7 +2018,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); ns->features = 0; - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); /* the PI implementation requires metadata equal t10 pi tuple size */ if (ns->ms == sizeof(struct t10_pi_tuple)) ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; @@ -2015,7 +2060,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) return 0; } -static int nvme_revalidate_disk(struct gendisk *disk) +static int _nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; @@ -2063,6 +2108,28 @@ out: return ret; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + int ret; + + ret = _nvme_revalidate_disk(disk); + if (ret) + return ret; + +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(disk->queue)) { + struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + + ret = blk_revalidate_disk_zones(disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(disk->queue, + ctrl->max_zone_append); + } +#endif + return ret; +} + static char nvme_pr_type(enum pr_type type) { switch (type) { @@ -2193,6 +2260,7 @@ static const struct block_device_operations nvme_fops = { .release = nvme_release, .getgeo = nvme_getgeo, .revalidate_disk= nvme_revalidate_disk, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; @@ -2219,6 +2287,7 @@ const struct block_device_operations nvme_ns_head_ops = { .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; #endif /* CONFIG_NVME_MULTIPATH */ @@ -4446,6 +4515,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fe9424c7097f..13ca90bcd352 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -238,6 +238,9 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; +#ifdef CONFIG_BLK_DEV_ZONED + u32 max_zone_append; +#endif u16 crdt[3]; u16 oncs; u16 oacs; @@ -404,6 +407,9 @@ struct nvme_ns { u16 sgs; u32 sws; u8 pi_type; +#ifdef CONFIG_BLK_DEV_ZONED + u64 zsze; +#endif unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 @@ -571,6 +577,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx); +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct block_device_operations nvme_ns_head_ops; @@ -693,6 +702,36 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) } #endif /* CONFIG_NVME_MULTIPATH */ +#ifdef CONFIG_BLK_DEV_ZONED +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf); + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action); +#else +#define nvme_report_zones NULL + +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action) +{ + return BLK_STS_NOTSUPP; +} + +static inline int nvme_update_zone_info(struct gendisk *disk, + struct nvme_ns *ns, + unsigned lbaf) +{ + dev_warn(ns->ctrl->device, + "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); + return -EPROTONOSUPPORT; +} +#endif + #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c new file mode 100644 index 000000000000..04e5b991c00c --- /dev/null +++ b/drivers/nvme/host/zns.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +#include +#include +#include "nvme.h" + +static int nvme_set_max_append(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_zns *id; + int status; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (status) { + kfree(id); + return status; + } + + if (id->zasl) + ctrl->max_zone_append = 1 << (id->zasl + 3); + else + ctrl->max_zone_append = ctrl->max_hw_sectors; + kfree(id); + return 0; +} + +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf) +{ + struct nvme_effects_log *log = ns->head->effects; + struct request_queue *q = disk->queue; + struct nvme_command c = { }; + struct nvme_id_ns_zns *id; + int status; + + /* Driver requires zone append support */ + if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + NVME_CMD_EFFECTS_CSUPP)) { + dev_warn(ns->ctrl->device, + "append not supported for zoned namespace:%d\n", + ns->head->ns_id); + return -EINVAL; + } + + /* Lazily query controller append limit for the first zoned namespace */ + if (!ns->ctrl->max_zone_append) { + status = nvme_set_max_append(ns->ctrl); + if (status) + return status; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(ns->head->ns_id); + c.identify.cns = NVME_ID_CNS_CS_NS; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id)); + if (status) + goto free_data; + + /* + * We currently do not handle devices requiring any of the zoned + * operation characteristics. + */ + if (id->zoc) { + dev_warn(ns->ctrl->device, + "zone operations:%x not supported for namespace:%u\n", + le16_to_cpu(id->zoc), ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze)); + if (!is_power_of_2(ns->zsze)) { + dev_warn(ns->ctrl->device, + "invalid zone size:%llu for namespace:%u\n", + ns->zsze, ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); +free_data: + kfree(id); + return status; +} + +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ns->disk->queue; + size_t bufsize; + void *buf; + + const size_t min_bufsize = sizeof(struct nvme_zone_report) + + sizeof(struct nvme_zone_descriptor); + + nr_zones = min_t(unsigned int, nr_zones, + get_capacity(ns->disk) >> ilog2(ns->zsze)); + + bufsize = sizeof(struct nvme_zone_report) + + nr_zones * sizeof(struct nvme_zone_descriptor); + bufsize = min_t(size_t, bufsize, + queue_max_hw_sectors(q) << SECTOR_SHIFT); + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + + while (bufsize >= min_bufsize) { + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + return NULL; +} + +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + struct nvme_zone_report *report, + size_t buflen) +{ + struct nvme_command c = { }; + int ret; + + c.zmr.opcode = nvme_cmd_zone_mgmt_recv; + c.zmr.nsid = cpu_to_le32(ns->head->ns_id); + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); + c.zmr.zra = NVME_ZRA_ZONE_REPORT; + c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; + + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); + if (ret) + return ret; + + return le64_to_cpu(report->nr_zones); +} + +static int nvme_zone_parse_entry(struct nvme_ns *ns, + struct nvme_zone_descriptor *entry, + unsigned int idx, report_zones_cb cb, + void *data) +{ + struct blk_zone zone = { }; + + if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) { + dev_err(ns->ctrl->device, "invalid zone type %#x\n", + entry->zt); + return -EINVAL; + } + + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone.cond = entry->zs >> 4; + zone.len = ns->zsze; + zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap)); + zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba)); + zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)); + + return cb(&zone, idx, data); +} + +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_zone_report *report; + int ret, zone_idx = 0; + unsigned int nz, i; + size_t buflen; + + report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen); + if (!report) + return -ENOMEM; + + sector &= ~(ns->zsze - 1); + while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) { + memset(report, 0, buflen); + ret = __nvme_ns_report_zones(ns, sector, report, buflen); + if (ret < 0) + goto out_free; + + nz = min_t(unsigned int, ret, nr_zones); + if (!nz) + break; + + for (i = 0; i < nz && zone_idx < nr_zones; i++) { + ret = nvme_zone_parse_entry(ns, &report->entries[i], + zone_idx, cb, data); + if (ret) + goto out_free; + zone_idx++; + } + + sector += ns->zsze * nz; + } + + if (zone_idx > 0) + ret = zone_idx; + else + ret = -EINVAL; +out_free: + kvfree(report); + return ret; +} + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + if (ns->head->ids.csi == NVME_CSI_ZNS) + ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data); + else + ret = -EINVAL; + nvme_put_ns_from_disk(head, srcu_idx); + + return ret; +} + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *c, enum nvme_zone_mgmt_action action) +{ + c->zms.opcode = nvme_cmd_zone_mgmt_send; + c->zms.nsid = cpu_to_le32(ns->head->ns_id); + c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); + c->zms.zsa = action; + + if (req_op(req) == REQ_OP_ZONE_RESET_ALL) + c->zms.select_all = 1; + + return BLK_STS_OK; +} diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 95cd03e240a1..1643005d21e3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -374,6 +374,30 @@ struct nvme_id_ns { __u8 vs[3712]; }; +struct nvme_zns_lbafe { + __le64 zsze; + __u8 zdes; + __u8 rsvd9[7]; +}; + +struct nvme_id_ns_zns { + __le16 zoc; + __le16 ozcs; + __le32 mar; + __le32 mor; + __le32 rrl; + __le32 frl; + __u8 rsvd20[2796]; + struct nvme_zns_lbafe lbafe[16]; + __u8 rsvd3072[768]; + __u8 vs[256]; +}; + +struct nvme_id_ctrl_zns { + __u8 zasl; + __u8 rsvd1[4095]; +}; + enum { NVME_ID_CNS_NS = 0x00, NVME_ID_CNS_CTRL = 0x01, @@ -392,6 +416,7 @@ enum { enum { NVME_CSI_NVM = 0, + NVME_CSI_ZNS = 2, }; enum { @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr { __le16 rsvd10[3]; }; +struct nvme_zone_descriptor { + __u8 zt; + __u8 zs; + __u8 za; + __u8 rsvd3[5]; + __le64 zcap; + __le64 zslba; + __le64 wp; + __u8 rsvd32[32]; +}; + +enum { + NVME_ZONE_TYPE_SEQWRITE_REQ = 0x2, +}; + +struct nvme_zone_report { + __le64 nr_zones; + __u8 resv8[56]; + struct nvme_zone_descriptor entries[]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -626,6 +672,9 @@ enum nvme_opcode { nvme_cmd_resv_report = 0x0e, nvme_cmd_resv_acquire = 0x11, nvme_cmd_resv_release = 0x15, + nvme_cmd_zone_mgmt_send = 0x79, + nvme_cmd_zone_mgmt_recv = 0x7a, + nvme_cmd_zone_append = 0x7d, }; #define nvme_opcode_name(opcode) { opcode, #opcode } @@ -764,6 +813,7 @@ struct nvme_rw_command { enum { NVME_RW_LR = 1 << 15, NVME_RW_FUA = 1 << 14, + NVME_RW_APPEND_PIREMAP = 1 << 9, NVME_RW_DSM_FREQ_UNSPEC = 0, NVME_RW_DSM_FREQ_TYPICAL = 1, NVME_RW_DSM_FREQ_RARE = 2, @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd { __le16 appmask; }; +enum nvme_zone_mgmt_action { + NVME_ZONE_CLOSE = 0x1, + NVME_ZONE_FINISH = 0x2, + NVME_ZONE_OPEN = 0x3, + NVME_ZONE_RESET = 0x4, + NVME_ZONE_OFFLINE = 0x5, + NVME_ZONE_SET_DESC_EXT = 0x10, +}; + +struct nvme_zone_mgmt_send_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le32 cdw12; + __u8 zsa; + __u8 select_all; + __u8 rsvd13[2]; + __le32 cdw14[2]; +}; + +struct nvme_zone_mgmt_recv_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd2[2]; + union nvme_data_ptr dptr; + __le64 slba; + __le32 numd; + __u8 zra; + __u8 zrasf; + __u8 pr; + __u8 rsvd13; + __le32 cdw14[2]; +}; + +enum { + NVME_ZRA_ZONE_REPORT = 0, + NVME_ZRASF_ZONE_REPORT_ALL = 0, + NVME_REPORT_ZONE_PARTIAL = 1, +}; + /* Features */ enum { @@ -1300,6 +1397,8 @@ struct nvme_command { struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; struct nvme_write_zeroes_cmd write_zeroes; + struct nvme_zone_mgmt_send_cmd zms; + struct nvme_zone_mgmt_recv_cmd zmr; struct nvme_abort_cmd abort; struct nvme_get_log_page_command get_log_page; struct nvmf_common_command fabrics; @@ -1433,6 +1532,18 @@ enum { NVME_SC_DISCOVERY_RESTART = 0x190, NVME_SC_AUTH_REQUIRED = 0x191, + /* + * I/O Command Set Specific - Zoned commands: + */ + NVME_SC_ZONE_BOUNDARY_ERROR = 0x1b8, + NVME_SC_ZONE_FULL = 0x1b9, + NVME_SC_ZONE_READ_ONLY = 0x1ba, + NVME_SC_ZONE_OFFLINE = 0x1bb, + NVME_SC_ZONE_INVALID_WRITE = 0x1bc, + NVME_SC_ZONE_TOO_MANY_ACTIVE = 0x1bd, + NVME_SC_ZONE_TOO_MANY_OPEN = 0x1be, + NVME_SC_ZONE_INVALID_TRANSITION = 0x1bf, + /* * Media and Data Integrity Errors: */ -- cgit v1.2.3 From 764075fdcb2f09baa23fdc4df4b79741e5c39b57 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 5 Jul 2020 00:57:55 -0700 Subject: nvme: expose reconnect_delay and ctrl_loss_tmo via sysfs This is useful information, and moreover its it's useful to be able to alter these parameters per controller after it has been established. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a8ee10a0cd32..4aaffc4fa150 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3535,6 +3535,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev, } static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); +static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (ctrl->opts->max_reconnects == -1) + return sprintf(buf, "off\n"); + return sprintf(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ctrl_loss_tmo, err; + + err = kstrtoint(buf, 10, &ctrl_loss_tmo); + if (err) + return -EINVAL; + + else if (ctrl_loss_tmo < 0) + opts->max_reconnects = -1; + else + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + return count; +} +static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); + +static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->reconnect_delay == -1) + return sprintf(buf, "off\n"); + return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + unsigned int v; + int err; + + err = kstrtou32(buf, 10, &v); + if (err || v > UINT_MAX) + return -EINVAL; + + ctrl->opts->reconnect_delay = v; + return count; +} +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, + nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3552,6 +3612,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_sqsize.attr, &dev_attr_hostnqn.attr, &dev_attr_hostid.attr, + &dev_attr_ctrl_loss_tmo.attr, + &dev_attr_reconnect_delay.attr, NULL }; -- cgit v1.2.3 From 972b13e29d40753a4ab2cd9735bd6ce26e91d6a6 Mon Sep 17 00:00:00 2001 From: David Fugate Date: Thu, 2 Jul 2020 15:31:22 -0600 Subject: nvme: document quirked Intel models Documented model names of Intel SSDs requiring quirks. Signed-off-by: David Fugate Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b3538141ec11..83585ed5ec1f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3079,16 +3079,16 @@ static const struct pci_error_handlers nvme_err_handler = { }; static const struct pci_device_id nvme_id_table[] = { - { PCI_VDEVICE(INTEL, 0x0953), + { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a53), + { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a54), + { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, - { PCI_VDEVICE(INTEL, 0x0a55), + { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ -- cgit v1.2.3 From c25c853ef60d2c0d37420b9e4b81bdd49e90b46e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:22 +0800 Subject: nvme-pci: remove redundant segment validation We've validated the segment counts before calling nvme_map_data(), so there is no need to validate again in nvme_pci_use_sgls(, which is only called from nvme_map_data(). Signed-off-by: Baolin Wang Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 83585ed5ec1f..9216cbd2fd43 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -501,9 +501,6 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; - if (nseg == 0) - return false; - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) -- cgit v1.2.3 From ee0d96d3225f5e3688391a033b9e373b5b603315 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:20 +0800 Subject: nvme-pci: fix some comments issues Fix comment typos and remove whitespaces before tabs to cleanup checkpatch errors. Signed-off-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9216cbd2fd43..a4725b37b288 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1258,9 +1258,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) } /* - * Shutdown the controller immediately and schedule a reset if the - * command was already aborted once before and still hasn't been - * returned to the driver, or if this is the admin queue. + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. */ if (!nvmeq->qid || iod->aborted) { dev_warn(dev->ctrl.device, @@ -2000,7 +2000,7 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; /* - * If there is no interupt available for queues, ensure that + * If there is no interrupt available for queues, ensure that * the default queue is set to 1. The affinity set size is * also set to one, but the irq core ignores it for this case. * -- cgit v1.2.3 From 4e523547e2bf755d40cb10e85795c2f9620ff3fb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:21 +0800 Subject: nvme-pci: add a blank line after declarations Add a blank line after declarations to make code more readable. Signed-off-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a4725b37b288..41c2055c6fc0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1017,6 +1017,7 @@ static irqreturn_t nvme_irq(int irq, void *data) static irqreturn_t nvme_irq_check(int irq, void *data) { struct nvme_queue *nvmeq = data; + if (nvme_cqe_pending(nvmeq)) return IRQ_WAKE_THREAD; return IRQ_NONE; @@ -1399,6 +1400,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); + mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); q_depth = div_u64(mem_per_q, entry_size); @@ -2873,6 +2875,7 @@ static void nvme_reset_done(struct pci_dev *pdev) static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_disable_prepare_reset(dev, true); } @@ -3003,6 +3006,7 @@ unfreeze: static int nvme_simple_suspend(struct device *dev) { struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); + return nvme_disable_prepare_reset(ndev, true); } -- cgit v1.2.3 From 9056fc9fc514ecd2457a59c575863ecb07c4fa5e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:23 +0800 Subject: nvme-pci: use the consistent return type of nvme_pci_iod_alloc_size() The nvme_pci_iod_alloc_size() should return 'size_t' type to be consistent with the sizeof return value. Signed-off-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 41c2055c6fc0..c9083c87c6cb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -362,7 +362,7 @@ static int nvme_pci_npages_sgl(unsigned int num_seg) return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); } -static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, +static size_t nvme_pci_iod_alloc_size(struct nvme_dev *dev, unsigned int size, unsigned int nseg, bool use_sgl) { size_t alloc_size; -- cgit v1.2.3 From 359c1f88ab646174bf82d18454c3ee2a38462ac8 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 3 Jul 2020 10:49:24 +0800 Subject: nvme-pci: use standard block status symbolic names Signed-off-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c9083c87c6cb..45e94f016ec2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -762,7 +762,7 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); if (bv->bv_len > first_prp_len) cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); - return 0; + return BLK_STS_OK; } static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, @@ -780,7 +780,7 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; - return 0; + return BLK_STS_OK; } static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, @@ -844,7 +844,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, if (dma_mapping_error(dev->dev, iod->meta_dma)) return BLK_STS_IOERR; cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); - return 0; + return BLK_STS_OK; } /* -- cgit v1.2.3 From 3913f4f3a65ca9ed6ba7e4678fff10a6e7b42dbd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 16:18:27 +0200 Subject: nvme: remove ns->disk checks By the time a namespace is added to ctrl->namespaces list and thus can be looked up ns->disk has been assigned, and it it never cleared. Remove all the checks for ns->disk being NULL. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4aaffc4fa150..3d00ea4e7146 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) * Revalidating a dead namespace sets capacity to 0. This will end * buffered writers dirtying pages that can't be synced. */ - if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; blk_set_queue_dying(ns->queue); /* Forcibly unquiesce queues to avoid blocking dispatch */ @@ -1429,7 +1429,7 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - if (ns->disk && _nvme_revalidate_disk(ns->disk)) + if (_nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); else if (blk_queue_is_zoned(ns->disk->queue)) { /* @@ -3933,7 +3933,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_mpath_clear_current_path(ns); synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ - if (ns->disk && ns->disk->flags & GENHD_FL_UP) { + if (ns->disk->flags & GENHD_FL_UP) { del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) @@ -3964,7 +3964,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = nvme_find_get_ns(ctrl, nsid); if (ns) { - if (ns->disk && revalidate_disk(ns->disk)) + if (revalidate_disk(ns->disk)) nvme_ns_remove(ns); nvme_put_ns(ns); } else -- cgit v1.2.3 From e15864f8ea05b24071b07300459ae7e511d0b938 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 14 Jul 2020 23:18:23 +0200 Subject: block: add max_open_zones to blk-sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new max_open_zones definition in the sysfs documentation. This definition will be common for all devices utilizing the zoned block device support in the kernel. Export max open zones according to this new definition for NVMe Zoned Namespace devices, ZAC ATA devices (which are treated as SCSI devices by the kernel), and ZBC SCSI devices. Add the new max_open_zones member to struct request_queue, rather than as a queue limit, since this property cannot be split across stacking drivers. Signed-off-by: Niklas Cassel Reviewed-by: Javier González Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 9 +++++++++ Documentation/block/queue-sysfs.rst | 7 +++++++ block/blk-sysfs.c | 15 +++++++++++++++ drivers/nvme/host/zns.c | 1 + drivers/scsi/sd_zbc.c | 4 ++++ include/linux/blkdev.h | 25 +++++++++++++++++++++++++ 6 files changed, 61 insertions(+) (limited to 'drivers/nvme') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index ed8c14f161ee..f151d9cf90de 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -273,6 +273,15 @@ Description: device ("host-aware" or "host-managed" zone model). For regular block devices, the value is always 0. +What: /sys/block//queue/max_open_zones +Date: July 2020 +Contact: Niklas Cassel +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, + is limited by this value. If this value is 0, there is no limit. + What: /sys/block//queue/chunk_sectors Date: September 2016 Contact: Hannes Reinecke diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index 6a8513af9201..f01cf8530ae4 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -117,6 +117,13 @@ Maximum number of elements in a DMA scatter/gather list with integrity data that will be submitted by the block layer core to the associated block driver. +max_open_zones (RO) +------------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), the sum of zones belonging to any of the zone states: +EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value. +If this value is 0, there is no limit. + max_sectors_kb (RW) ------------------- This is the maximum number of kilobytes that the block layer will allow diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index be67952e7be2..414f04579d77 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -306,6 +306,11 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_nr_zones(q), page); } +static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_open_zones(q), page); +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -668,6 +673,11 @@ static struct queue_sysfs_entry queue_nr_zones_entry = { .show = queue_nr_zones_show, }; +static struct queue_sysfs_entry queue_max_open_zones_entry = { + .attr = {.name = "max_open_zones", .mode = 0444 }, + .show = queue_max_open_zones_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = 0644 }, .show = queue_nomerges_show, @@ -766,6 +776,7 @@ static struct attribute *queue_attrs[] = { &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, + &queue_max_open_zones_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -793,6 +804,10 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, (!q->mq_ops || !q->mq_ops->timeout)) return 0; + if (attr == &queue_max_open_zones_entry.attr && + !blk_queue_is_zoned(q)) + return 0; + return attr->mode; } diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 04e5b991c00c..3d80b9cf6bfc 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -96,6 +96,7 @@ int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, q->limits.zoned = BLK_ZONED_HM; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); free_data: kfree(id); return status; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 183a20720da9..aa3564139b40 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -717,6 +717,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + if (sdkp->zones_max_open == U32_MAX) + blk_queue_max_open_zones(q, 0); + else + blk_queue_max_open_zones(q, sdkp->zones_max_open); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index de7adc59b993..c8beb8bbdb08 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -513,6 +513,7 @@ struct request_queue { unsigned int nr_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; + unsigned int max_open_zones; #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -722,6 +723,17 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, return true; return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap); } + +static inline void blk_queue_max_open_zones(struct request_queue *q, + unsigned int max_open_zones) +{ + q->max_open_zones = max_open_zones; +} + +static inline unsigned int queue_max_open_zones(const struct request_queue *q) +{ + return q->max_open_zones; +} #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { @@ -737,6 +749,10 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q, { return 0; } +static inline unsigned int queue_max_open_zones(const struct request_queue *q) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) @@ -1519,6 +1535,15 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return 0; } +static inline unsigned int bdev_max_open_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return queue_max_open_zones(q); + return 0; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From 659bf827ba8f1183b714341d8a1d4b1e446178d9 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 14 Jul 2020 23:18:24 +0200 Subject: block: add max_active_zones to blk-sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new max_active zones definition in the sysfs documentation. This definition will be common for all devices utilizing the zoned block device support in the kernel. Export max_active_zones according to this new definition for NVMe Zoned Namespace devices, ZAC ATA devices (which are treated as SCSI devices by the kernel), and ZBC SCSI devices. Add the new max_active_zones member to struct request_queue, rather than as a queue limit, since this property cannot be split across stacking drivers. For SCSI devices, even though max active zones is not part of the ZBC/ZAC spec, export max_active_zones as 0, signifying "no limit". Signed-off-by: Niklas Cassel Reviewed-by: Javier González Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 9 +++++++++ Documentation/block/queue-sysfs.rst | 7 +++++++ block/blk-sysfs.c | 14 +++++++++++++- drivers/nvme/host/zns.c | 1 + drivers/scsi/sd_zbc.c | 1 + include/linux/blkdev.h | 25 +++++++++++++++++++++++++ 6 files changed, 56 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index f151d9cf90de..2322eb748b38 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -273,6 +273,15 @@ Description: device ("host-aware" or "host-managed" zone model). For regular block devices, the value is always 0. +What: /sys/block//queue/max_active_zones +Date: July 2020 +Contact: Niklas Cassel +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, + is limited by this value. If this value is 0, there is no limit. + What: /sys/block//queue/max_open_zones Date: July 2020 Contact: Niklas Cassel diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index f01cf8530ae4..f261a5c84170 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -117,6 +117,13 @@ Maximum number of elements in a DMA scatter/gather list with integrity data that will be submitted by the block layer core to the associated block driver. +max_active_zones (RO) +--------------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), the sum of zones belonging to any of the zone states: +EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value. +If this value is 0, there is no limit. + max_open_zones (RO) ------------------- For zoned block devices (zoned attribute indicating "host-managed" or diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 414f04579d77..7dda709f3ccb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -311,6 +311,11 @@ static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) return queue_var_show(queue_max_open_zones(q), page); } +static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_active_zones(q), page); +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -678,6 +683,11 @@ static struct queue_sysfs_entry queue_max_open_zones_entry = { .show = queue_max_open_zones_show, }; +static struct queue_sysfs_entry queue_max_active_zones_entry = { + .attr = {.name = "max_active_zones", .mode = 0444 }, + .show = queue_max_active_zones_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = 0644 }, .show = queue_nomerges_show, @@ -777,6 +787,7 @@ static struct attribute *queue_attrs[] = { &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, + &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -804,7 +815,8 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, (!q->mq_ops || !q->mq_ops->timeout)) return 0; - if (attr == &queue_max_open_zones_entry.attr && + if ((attr == &queue_max_open_zones_entry.attr || + attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) return 0; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 3d80b9cf6bfc..57cfd78731fb 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -97,6 +97,7 @@ int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, q->limits.zoned = BLK_ZONED_HM; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); + blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); free_data: kfree(id); return status; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index aa3564139b40..d8b2c49d645b 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -721,6 +721,7 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) blk_queue_max_open_zones(q, 0); else blk_queue_max_open_zones(q, sdkp->zones_max_open); + blk_queue_max_active_zones(q, 0); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c8beb8bbdb08..285b59cfc064 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -514,6 +514,7 @@ struct request_queue { unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int max_open_zones; + unsigned int max_active_zones; #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -734,6 +735,17 @@ static inline unsigned int queue_max_open_zones(const struct request_queue *q) { return q->max_open_zones; } + +static inline void blk_queue_max_active_zones(struct request_queue *q, + unsigned int max_active_zones) +{ + q->max_active_zones = max_active_zones; +} + +static inline unsigned int queue_max_active_zones(const struct request_queue *q) +{ + return q->max_active_zones; +} #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { @@ -753,6 +765,10 @@ static inline unsigned int queue_max_open_zones(const struct request_queue *q) { return 0; } +static inline unsigned int queue_max_active_zones(const struct request_queue *q) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) @@ -1544,6 +1560,15 @@ static inline unsigned int bdev_max_open_zones(struct block_device *bdev) return 0; } +static inline unsigned int bdev_max_active_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return queue_max_active_zones(q); + return 0; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From eca9e8271e596b73e3ccefdcd26eb85b46b21a34 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 14 Jul 2020 13:57:32 +0300 Subject: nvme: remove an unnecessary condition "v" is an unsigned int so it can't be more than UINT_MAX. Removing this check makes it easier to preserve the error code as well. Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3d00ea4e7146..5fb5e8ba531b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3586,8 +3586,8 @@ static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, int err; err = kstrtou32(buf, 10, &v); - if (err || v > UINT_MAX) - return -EINVAL; + if (err) + return err; ctrl->opts->reconnect_delay = v; return count; -- cgit v1.2.3 From 5887450b69e72d4b472a7d049773f6a01bc24cd7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 13 Jul 2020 14:25:21 +0800 Subject: nvme: remove redundant validation in nvme_start_ctrl() We've already validated the 'kato' in nvme_start_keep_alive(), thus no need to validate it again in nvme_start_ctrl(). Remove it. Signed-off-by: Baolin Wang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5fb5e8ba531b..f49085bcaa42 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4313,8 +4313,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl) { - if (ctrl->kato) - nvme_start_keep_alive(ctrl); + nvme_start_keep_alive(ctrl); nvme_enable_aen(ctrl); -- cgit v1.2.3 From 6c3c05b087ada8947cd31895f67e433070446234 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 16 Jul 2020 17:51:37 -0700 Subject: nvme-core: replace ctrl page size with a macro Saving the nvme controller's page size was from a time when the driver tried to use different sized pages, but this value is always set to a constant, and has been this way for some time. Remove the 'page_size' field and replace its usage with the constant value. This also lets the compiler make some micro-optimizations in the io path, and that's always a good thing. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 19 ++++++------------- drivers/nvme/host/nvme.h | 9 ++++++++- drivers/nvme/host/pci.c | 47 +++++++++++++++++++++++------------------------ 3 files changed, 37 insertions(+), 38 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f49085bcaa42..1d7c7afb1348 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2345,12 +2345,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { - /* - * Default to a 4K page size, with the intention to update this - * path in the future to accomodate architectures with differing - * kernel and IO page sizes. - */ - unsigned dev_page_min, page_shift = 12; + unsigned dev_page_min; int ret; ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); @@ -2360,20 +2355,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) } dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; - if (page_shift < dev_page_min) { + if (NVME_CTRL_PAGE_SHIFT < dev_page_min) { dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", - 1 << dev_page_min, 1 << page_shift); + 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT); return -ENODEV; } - ctrl->page_size = 1 << page_shift; - if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) ctrl->ctrl_config = NVME_CC_CSS_CSI; else ctrl->ctrl_config = NVME_CC_CSS_NVM; - ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; ctrl->ctrl_config |= NVME_CC_ENABLE; @@ -2423,13 +2416,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, if (ctrl->max_hw_sectors) { u32 max_segments = - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; max_segments = min_not_zero(max_segments, ctrl->max_segments); blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } - blk_queue_virt_boundary(q, ctrl->page_size - 1); + blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); blk_queue_dma_alignment(q, 7); if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) vwc = true; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 13ca90bcd352..2ee91a3dd8e0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -37,6 +37,14 @@ extern unsigned int admin_timeout; #define NVME_INLINE_METADATA_SG_CNT 1 #endif +/* + * Default to a 4K page size, with the intention to update this + * path in the future to accommodate architectures with differing + * kernel and IO page sizes. + */ +#define NVME_CTRL_PAGE_SHIFT 12 +#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT) + extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; @@ -234,7 +242,6 @@ struct nvme_ctrl { u32 queue_count; u64 cap; - u32 page_size; u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 45e94f016ec2..0ab0dcf84d5e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -348,8 +348,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, - dev->ctrl.page_size); + unsigned nprps = DIV_ROUND_UP(size + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } @@ -515,7 +515,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; + const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; dma_addr_t dma_addr = iod->first_dma, next_dma_addr; int i; @@ -582,34 +582,33 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->ctrl.page_size; - int offset = dma_addr & (page_size - 1); + int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; - length -= (page_size - offset); + length -= (NVME_CTRL_PAGE_SIZE - offset); if (length <= 0) { iod->first_dma = 0; goto done; } - dma_len -= (page_size - offset); + dma_len -= (NVME_CTRL_PAGE_SIZE - offset); if (dma_len) { - dma_addr += (page_size - offset); + dma_addr += (NVME_CTRL_PAGE_SIZE - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= page_size) { + if (length <= NVME_CTRL_PAGE_SIZE) { iod->first_dma = dma_addr; goto done; } - nprps = DIV_ROUND_UP(length, page_size); + nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -628,7 +627,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == page_size >> 3) { + if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) @@ -639,9 +638,9 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= page_size; - dma_addr += page_size; - length -= page_size; + dma_len -= NVME_CTRL_PAGE_SIZE; + dma_addr += NVME_CTRL_PAGE_SIZE; + length -= NVME_CTRL_PAGE_SIZE; if (length <= 0) break; if (dma_len > 0) @@ -751,8 +750,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, struct bio_vec *bv) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1); - unsigned int first_prp_len = dev->ctrl.page_size - offset; + unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->first_dma)) @@ -794,7 +793,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2) + if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); @@ -1396,12 +1395,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, { int q_depth = dev->q_depth; unsigned q_size_aligned = roundup(q_depth * entry_size, - dev->ctrl.page_size); + NVME_CTRL_PAGE_SIZE); if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); + mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE); q_depth = div_u64(mem_per_q, entry_size); /* @@ -1816,6 +1815,7 @@ static inline void nvme_release_cmb(struct nvme_dev *dev) static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) { + u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT; u64 dma_addr = dev->host_mem_descs_dma; struct nvme_command c; int ret; @@ -1824,8 +1824,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) c.features.opcode = nvme_admin_set_features; c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); c.features.dword11 = cpu_to_le32(bits); - c.features.dword12 = cpu_to_le32(dev->host_mem_size >> - ilog2(dev->ctrl.page_size)); + c.features.dword12 = cpu_to_le32(host_mem_size); c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); @@ -1845,7 +1844,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev) for (i = 0; i < dev->nr_host_mem_descs; i++) { struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; - size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; + size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], le64_to_cpu(desc->addr), @@ -1897,7 +1896,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, break; descs[i].addr = cpu_to_le64(dma_addr); - descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); + descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE); i++; } @@ -1913,7 +1912,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, out_free_bufs: while (--i >= 0) { - size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; + size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; dma_free_attrs(dev->dev, size, bufs[i], le64_to_cpu(descs[i].addr), -- cgit v1.2.3 From b13c6393be7dfa780835fd4400d16d0ec1cf128f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 20 Jul 2020 15:23:37 +0200 Subject: nvme-pci: use max of PRP or SGL for iod size >From the initial implementation of NVMe SGL kernel support commit a7a7cbe353a5 ("nvme-pci: add SGL support") with addition of the commit 943e942e6266 ("nvme-pci: limit max IO size and segments to avoid high order allocations") now there is only caller left for nvme_pci_iod_alloc_size() which statically passes true for last parameter that calculates allocation size based on SGL since we need size of biggest command supported for mempool allocation. This patch modifies the helper functions nvme_pci_iod_alloc_size() such that it is now uses maximum of PRP and SGL size for iod allocation size calculation. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0ab0dcf84d5e..96c1ae3712be 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -346,9 +346,9 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) +static int nvme_pci_npages_prp(void) { - unsigned nprps = DIV_ROUND_UP(size + NVME_CTRL_PAGE_SIZE, + unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } @@ -357,22 +357,18 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) * Calculates the number of pages needed for the SGL segments. For example a 4k * page can accommodate 256 SGL descriptors. */ -static int nvme_pci_npages_sgl(unsigned int num_seg) +static int nvme_pci_npages_sgl(void) { - return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); + return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), + PAGE_SIZE); } -static size_t nvme_pci_iod_alloc_size(struct nvme_dev *dev, - unsigned int size, unsigned int nseg, bool use_sgl) +static size_t nvme_pci_iod_alloc_size(void) { - size_t alloc_size; - - if (use_sgl) - alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); - else - alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - return alloc_size + sizeof(struct scatterlist) * nseg; + return sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -2811,8 +2807,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) * Double check that our mempool alloc size will cover the biggest * command we support. */ - alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, - NVME_MAX_SEGS, true); + alloc_size = nvme_pci_iod_alloc_size(); WARN_ON_ONCE(alloc_size > PAGE_SIZE); dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, -- cgit v1.2.3 From df4f9bc4fb9cb338b1c21cf99c6e905d3b78e91d Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 9 Jul 2020 11:43:33 -0700 Subject: nvme-pci: add support for ACPI StorageD3Enable property This patch implements a solution for a BIOS hack used on some currently shipping Intel systems to change driver power management policy for PCIe NVMe drives. Some newer Intel platforms, like some Comet Lake systems, require that PCIe devices use D3 when doing suspend-to-idle in order to allow the platform to realize maximum power savings. This is particularly needed to support ATX power supply shutdown on desktop systems. In order to ensure this happens for root ports with storage devices, Microsoft apparently created this ACPI _DSD property as a way to influence their driver policy. To my knowledge this property has not been discussed with the NVME specification body. Though the solution is not ideal, it addresses a problem that also affects Linux since the NVMe driver's default policy of using NVMe APST during suspend-to-idle prevents the PCI root port from going to D3 and leads to higher power consumption for these platforms. The power consumption difference may be negligible on laptop systems, but many watts on desktop systems when the ATX power supply is blocked from powering down. The patch creates a new nvme_acpi_storage_d3 function to check for the StorageD3Enable property during probe and enables D3 as a quirk if set. It also provides a 'noacpi' module parameter to allow skipping the quirk if needed. Tested with: - PM961 NVMe SED Samsung 512GB - INTEL SSDPEKKF512G8 Link: https://docs.microsoft.com/en-us/windows-hardware/design/component-guidelines/power-management-for-storage-hardware-devices-intro Signed-off-by: David E. Box Reviewed-by: Rafael J. Wysocki Signed-off-by: Christoph Hellwig --- drivers/acpi/property.c | 3 +++ drivers/nvme/host/pci.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index e601c4511a8b..c2e2ae774a19 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -45,6 +45,9 @@ static const guid_t prp_guids[] = { /* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */ GUID_INIT(0x6c501103, 0xc189, 0x4296, 0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d), + /* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */ + GUID_INIT(0x5025030f, 0x842f, 0x4ab4, + 0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0), }; /* ACPI _DSD data subnodes GUID: dbb8e3e6-5886-4ba6-8795-1319f52a966b */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 96c1ae3712be..61e612d52d61 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4,6 +4,7 @@ * Copyright (c) 2011-2014, Intel Corporation. */ +#include #include #include #include @@ -94,6 +95,10 @@ static unsigned int poll_queues; module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); +static bool noacpi; +module_param(noacpi, bool, 0444); +MODULE_PARM_DESC(noacpi, "disable acpi bios quirks"); + struct nvme_dev; struct nvme_queue; @@ -2754,6 +2759,54 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } +#ifdef CONFIG_ACPI +static bool nvme_acpi_storage_d3(struct pci_dev *dev) +{ + struct acpi_device *adev; + struct pci_dev *root; + acpi_handle handle; + acpi_status status; + u8 val; + + /* + * Look for _DSD property specifying that the storage device on the port + * must use D3 to support deep platform power savings during + * suspend-to-idle. + */ + root = pcie_find_root_port(dev); + if (!root) + return false; + + adev = ACPI_COMPANION(&root->dev); + if (!adev) + return false; + + /* + * The property is defined in the PXSX device for South complex ports + * and in the PEGP device for North complex ports. + */ + status = acpi_get_handle(adev->handle, "PXSX", &handle); + if (ACPI_FAILURE(status)) { + status = acpi_get_handle(adev->handle, "PEGP", &handle); + if (ACPI_FAILURE(status)) + return false; + } + + if (acpi_bus_get_device(handle, &adev)) + return false; + + if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable", + &val)) + return false; + return val == 1; +} +#else +static inline bool nvme_acpi_storage_d3(struct pci_dev *dev) +{ + return false; +} +#endif /* CONFIG_ACPI */ + static void nvme_async_probe(void *data, async_cookie_t cookie) { struct nvme_dev *dev = data; @@ -2803,6 +2856,16 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) quirks |= check_vendor_combination_bug(pdev); + if (!noacpi && nvme_acpi_storage_d3(pdev)) { + /* + * Some systems use a bios work around to ask for D3 on + * platforms that support kernel managed suspend. + */ + dev_info(&pdev->dev, + "platform quirk: setting simple suspend\n"); + quirks |= NVME_QUIRK_SIMPLE_SUSPEND; + } + /* * Double check that our mempool alloc size will cover the biggest * command we support. -- cgit v1.2.3 From 287f329e3131a4e7d7ce9a7e8e6189698d1763f8 Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Mon, 13 Jul 2020 11:53:29 +0300 Subject: nvme-rdma: use new shared CQ mechanism Has the driver use shared CQs providing ~10%-20% improvement as seen in the patch introducing shared CQs. Instead of opening a CQ for each QP per controller connected, a CQ for each QP will be provided by the RDMA core driver that will be shared between the QPs on that core reducing interrupt overhead. Signed-off-by: Yamin Friedman Signed-off-by: Max Gurtovoy Reviewed-by: Or Gerlitz Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 77 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 26 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index e881f879ac63..467da08db309 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -96,6 +96,7 @@ struct nvme_rdma_queue { int cm_error; struct completion cm_done; bool pi_support; + int cq_size; }; struct nvme_rdma_ctrl { @@ -275,6 +276,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) init_attr.recv_cq = queue->ib_cq; if (queue->pi_support) init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + init_attr.qp_context = queue; ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); @@ -409,6 +411,14 @@ out_err: return NULL; } +static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue) +{ + if (nvme_rdma_poll_queue(queue)) + ib_free_cq(queue->ib_cq); + else + ib_cq_pool_put(queue->ib_cq, queue->cq_size); +} + static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { struct nvme_rdma_device *dev; @@ -430,7 +440,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) * the destruction of the QP shouldn't use rdma_cm API. */ ib_destroy_qp(queue->qp); - ib_free_cq(queue->ib_cq); + nvme_rdma_free_cq(queue); nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, sizeof(struct nvme_completion), DMA_FROM_DEVICE); @@ -450,13 +460,42 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support) return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1); } +static int nvme_rdma_create_cq(struct ib_device *ibdev, + struct nvme_rdma_queue *queue) +{ + int ret, comp_vector, idx = nvme_rdma_queue_idx(queue); + enum ib_poll_context poll_ctx; + + /* + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. + */ + comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; + + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) { + poll_ctx = IB_POLL_DIRECT; + queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size, + comp_vector, poll_ctx); + } else { + poll_ctx = IB_POLL_SOFTIRQ; + queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size, + comp_vector, poll_ctx); + } + + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + return ret; + } + + return 0; +} + static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { struct ib_device *ibdev; const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ - int comp_vector, idx = nvme_rdma_queue_idx(queue); - enum ib_poll_context poll_ctx; int ret, pages_per_mr; queue->device = nvme_rdma_find_get_device(queue->cm_id); @@ -467,26 +506,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) } ibdev = queue->device->dev; - /* - * Spread I/O queues completion vectors according their queue index. - * Admin queues can always go on completion vector 0. - */ - comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; - - /* Polling queues need direct cq polling context */ - if (nvme_rdma_poll_queue(queue)) - poll_ctx = IB_POLL_DIRECT; - else - poll_ctx = IB_POLL_SOFTIRQ; - /* +1 for ib_stop_cq */ - queue->ib_cq = ib_alloc_cq(ibdev, queue, - cq_factor * queue->queue_size + 1, - comp_vector, poll_ctx); - if (IS_ERR(queue->ib_cq)) { - ret = PTR_ERR(queue->ib_cq); + queue->cq_size = cq_factor * queue->queue_size + 1; + + ret = nvme_rdma_create_cq(ibdev, queue); + if (ret) goto out_put_dev; - } ret = nvme_rdma_create_qp(queue, send_wr_factor); if (ret) @@ -512,7 +537,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize MR pool sized %d for QID %d\n", - queue->queue_size, idx); + queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_ring; } @@ -523,7 +548,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) if (ret) { dev_err(queue->ctrl->ctrl.device, "failed to initialize PI MR pool sized %d for QID %d\n", - queue->queue_size, idx); + queue->queue_size, nvme_rdma_queue_idx(queue)); goto out_destroy_mr_pool; } } @@ -540,7 +565,7 @@ out_destroy_ring: out_destroy_qp: rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: - ib_free_cq(queue->ib_cq); + nvme_rdma_free_cq(queue); out_put_dev: nvme_rdma_dev_put(queue->device); return ret; @@ -1163,7 +1188,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req) static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, const char *op) { - struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_queue *queue = wc->qp->qp_context; struct nvme_rdma_ctrl *ctrl = queue->ctrl; if (ctrl->ctrl.state == NVME_CTRL_LIVE) @@ -1706,7 +1731,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); - struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_queue *queue = wc->qp->qp_context; struct ib_device *ibdev = queue->device->dev; struct nvme_completion *cqe = qe->data; const size_t len = sizeof(struct nvme_completion); -- cgit v1.2.3 From ca0f1a8055be2a04073af435dc68419334481638 Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Mon, 13 Jul 2020 11:53:30 +0300 Subject: nvmet-rdma: use new shared CQ mechanism Has the driver use shared CQs providing ~10%-20% improvement when multiple disks are used. Instead of opening a CQ for each QP per controller, a CQ for each core will be provided by the RDMA core driver that will be shared between the QPs on that core reducing interrupt overhead. Signed-off-by: Yamin Friedman Signed-off-by: Max Gurtovoy Reviewed-by: Or Gerlitz Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 6731e0349480..3ccb59260b4a 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -752,7 +752,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; u16 status = 0; WARN_ON(rsp->n_rdma <= 0); @@ -1008,7 +1008,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_cmd *cmd = container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); - struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_queue *queue = wc->qp->qp_context; struct nvmet_rdma_rsp *rsp; if (unlikely(wc->status != IB_WC_SUCCESS)) { @@ -1258,9 +1258,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) */ nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; - queue->cq = ib_alloc_cq(ndev->device, queue, - nr_cqe + 1, queue->comp_vector, - IB_POLL_WORKQUEUE); + queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1, + queue->comp_vector, IB_POLL_WORKQUEUE); if (IS_ERR(queue->cq)) { ret = PTR_ERR(queue->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", @@ -1322,7 +1321,7 @@ out: err_destroy_qp: rdma_destroy_qp(queue->cm_id); err_destroy_cq: - ib_free_cq(queue->cq); + ib_cq_pool_put(queue->cq, nr_cqe + 1); goto out; } @@ -1332,7 +1331,8 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) if (queue->cm_id) rdma_destroy_id(queue->cm_id); ib_destroy_qp(queue->qp); - ib_free_cq(queue->cq); + ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 * + queue->send_queue_size + 1); } static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) -- cgit v1.2.3 From 7774e77ebedcfeeac8cce61533b590269650dcf7 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 19 Jul 2020 20:32:02 -0700 Subject: nvmet: use xarray for ctrl ns storing This patch replaces the ctrl->namespaces tracking from linked list to xarray and improves the performance when accessing one namespce :- XArray vs Default:- IOPS and BW (more the better) increase BW (~1.8%):- --------------------------------------------------- XArray :- read: IOPS=160k, BW=626MiB/s (656MB/s)(18.3GiB/30001msec) read: IOPS=160k, BW=626MiB/s (656MB/s)(18.3GiB/30001msec) read: IOPS=162k, BW=631MiB/s (662MB/s)(18.5GiB/30001msec) Default:- read: IOPS=156k, BW=609MiB/s (639MB/s)(17.8GiB/30001msec) read: IOPS=157k, BW=613MiB/s (643MB/s)(17.0GiB/30001msec) read: IOPS=160k, BW=626MiB/s (656MB/s)(18.3GiB/30001msec) Submission latency (less the better) decrease (~8.3%):- ------------------------------------------------------- XArray:- slat (usec): min=7, max=8386, avg=11.19, stdev=5.96 slat (usec): min=7, max=441, avg=11.09, stdev=4.48 slat (usec): min=7, max=1088, avg=11.21, stdev=4.54 Default :- slat (usec): min=8, max=2826.5k, avg=23.96, stdev=3911.50 slat (usec): min=8, max=503, avg=12.52, stdev=5.07 slat (usec): min=8, max=2384, avg=12.50, stdev=5.28 CPU Usage (less the better) decrease (~5.2%):- ---------------------------------------------- XArray:- cpu : usr=1.84%, sys=18.61%, ctx=949471, majf=0, minf=250 cpu : usr=1.83%, sys=18.41%, ctx=950262, majf=0, minf=237 cpu : usr=1.82%, sys=18.82%, ctx=957224, majf=0, minf=234 Default:- cpu : usr=1.70%, sys=19.21%, ctx=858196, majf=0, minf=251 cpu : usr=1.82%, sys=19.98%, ctx=929720, majf=0, minf=227 cpu : usr=1.83%, sys=20.33%, ctx=947208, majf=0, minf=235. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 17 ++++------- drivers/nvme/target/core.c | 64 +++++++++++++---------------------------- drivers/nvme/target/nvmet.h | 3 +- 3 files changed, 27 insertions(+), 57 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 95bb3bc4e335..55918fcef80b 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -113,11 +113,10 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, u64 data_units_read = 0, data_units_written = 0; struct nvmet_ns *ns; struct nvmet_ctrl *ctrl; + unsigned long idx; ctrl = req->sq->ctrl; - - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; @@ -127,9 +126,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); data_units_written += DIV_ROUND_UP( part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); - } - rcu_read_unlock(); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); @@ -230,14 +227,13 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; + unsigned long idx; u32 count = 0; if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) + xa_for_each(&ctrl->subsys->namespaces, idx, ns) if (ns->anagrpid == grpid) desc->nsids[count++] = cpu_to_le32(ns->nsid); - rcu_read_unlock(); } desc->grpid = cpu_to_le32(grpid); @@ -556,6 +552,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req) static const int buf_size = NVME_IDENTIFY_DATA_SIZE; struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; + unsigned long idx; u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); __le32 *list; u16 status = 0; @@ -567,15 +564,13 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req) goto out; } - rcu_read_lock(); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_nsid) continue; list[i++] = cpu_to_le32(ns->nsid); if (i == buf_size / sizeof(__le32)) break; } - rcu_read_unlock(); status = nvmet_copy_to_sgl(req, 0, list, buf_size); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 9cdc39c8b729..59621b816f6e 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -115,13 +115,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) { - struct nvmet_ns *ns; + unsigned long nsid = 0; + struct nvmet_ns *cur; + unsigned long idx; - if (list_empty(&subsys->namespaces)) - return 0; + xa_for_each(&subsys->namespaces, idx, cur) + nsid = cur->nsid; - ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link); - return ns->nsid; + return nsid; } static u32 nvmet_async_event_result(struct nvmet_async_event *aen) @@ -410,28 +411,13 @@ static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) cancel_delayed_work_sync(&ctrl->ka_work); } -static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl, - __le32 nsid) -{ - struct nvmet_ns *ns; - - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { - if (ns->nsid == le32_to_cpu(nsid)) - return ns; - } - - return NULL; -} - struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) { struct nvmet_ns *ns; - rcu_read_lock(); - ns = __nvmet_find_namespace(ctrl, nsid); + ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid)); if (ns) percpu_ref_get(&ns->ref); - rcu_read_unlock(); return ns; } @@ -586,24 +572,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns) if (ns->nsid > subsys->max_nsid) subsys->max_nsid = ns->nsid; - /* - * The namespaces list needs to be sorted to simplify the implementation - * of the Identify Namepace List subcommand. - */ - if (list_empty(&subsys->namespaces)) { - list_add_tail_rcu(&ns->dev_link, &subsys->namespaces); - } else { - struct nvmet_ns *old; - - list_for_each_entry_rcu(old, &subsys->namespaces, dev_link, - lockdep_is_held(&subsys->lock)) { - BUG_ON(ns->nsid == old->nsid); - if (ns->nsid < old->nsid) - break; - } + ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL); + if (ret) + goto out_restore_subsys_maxnsid; - list_add_tail_rcu(&ns->dev_link, &old->dev_link); - } subsys->nr_namespaces++; nvmet_ns_changed(subsys, ns->nsid); @@ -612,6 +584,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns) out_unlock: mutex_unlock(&subsys->lock); return ret; + +out_restore_subsys_maxnsid: + subsys->max_nsid = nvmet_max_nsid(subsys); + percpu_ref_exit(&ns->ref); out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); @@ -630,7 +606,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns) goto out_unlock; ns->enabled = false; - list_del_rcu(&ns->dev_link); + xa_erase(&ns->subsys->namespaces, ns->nsid); if (ns->nsid == subsys->max_nsid) subsys->max_nsid = nvmet_max_nsid(subsys); @@ -681,7 +657,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) if (!ns) return NULL; - INIT_LIST_HEAD(&ns->dev_link); init_completion(&ns->disable_done); ns->nsid = nsid; @@ -1263,14 +1238,14 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, struct nvmet_req *req) { struct nvmet_ns *ns; + unsigned long idx; if (!req->p2p_client) return; ctrl->p2p_client = get_device(req->p2p_client); - list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link, - lockdep_is_held(&ctrl->subsys->lock)) + xa_for_each(&ctrl->subsys->namespaces, idx, ns) nvmet_p2pmem_ns_add_p2p(ctrl, ns); } @@ -1523,7 +1498,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, kref_init(&subsys->ref); mutex_init(&subsys->lock); - INIT_LIST_HEAD(&subsys->namespaces); + xa_init(&subsys->namespaces); INIT_LIST_HEAD(&subsys->ctrls); INIT_LIST_HEAD(&subsys->hosts); @@ -1535,8 +1510,9 @@ static void nvmet_subsys_free(struct kref *ref) struct nvmet_subsys *subsys = container_of(ref, struct nvmet_subsys, ref); - WARN_ON_ONCE(!list_empty(&subsys->namespaces)); + WARN_ON_ONCE(!xa_empty(&subsys->namespaces)); + xa_destroy(&subsys->namespaces); kfree(subsys->subsysnqn); kfree_rcu(subsys->model, rcuhead); kfree(subsys); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 6f8bd6a93575..49e14111446c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -52,7 +52,6 @@ (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) struct nvmet_ns { - struct list_head dev_link; struct percpu_ref ref; struct block_device *bdev; struct file *file; @@ -219,7 +218,7 @@ struct nvmet_subsys { struct mutex lock; struct kref ref; - struct list_head namespaces; + struct xarray namespaces; unsigned int nr_namespaces; unsigned int max_nsid; u16 cntlid_min; -- cgit v1.2.3 From 4212f4e94633f3403c3871d11213badc50b3f6e4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 22 Jul 2020 16:32:18 -0700 Subject: nvme: document nvme controller states We are starting to see some non-trivial states so lets start documenting them. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2ee91a3dd8e0..92629758b77c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -181,6 +181,20 @@ static inline u16 nvme_req_qid(struct request *req) */ #define NVME_QUIRK_DELAY_AMOUNT 2300 +/* + * enum nvme_ctrl_state: Controller state + * + * @NVME_CTRL_NEW: New controller just allocated, initial state + * @NVME_CTRL_LIVE: Controller is connected and I/O capable + * @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset) + * @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the + * transport + * @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion) + * @NVME_CTRL_DEAD: Controller is non-present/unresponsive during + * shutdown or removal. In this case we forcibly + * kill all inflight I/O as they have no chance to + * complete + */ enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, -- cgit v1.2.3 From ecca390e80561debbfdb4dc96bf94595136889fa Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 22 Jul 2020 16:32:19 -0700 Subject: nvme: fix deadlock in disconnect during scan_work and/or ana_work A deadlock happens in the following scenario with multipath: 1) scan_work(nvme0) detects a new nsid while nvme0 is an optimized path to it, path nvme1 happens to be inaccessible. 2) Before scan_work is complete nvme0 disconnect is initiated nvme_delete_ctrl_sync() sets nvme0 state to NVME_CTRL_DELETING 3) scan_work(1) attempts to submit IO, but nvme_path_is_optimized() observes nvme0 is not LIVE. Since nvme1 is a possible path IO is requeued and scan_work hangs. -- Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: efi_partition+0x1e6/0x708 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 -- 4) Delete also hangs in flush_work(ctrl->scan_work) from nvme_remove_namespaces(). Similiarly a deadlock with ana_work may happen: if ana_work has started and calls nvme_mpath_set_live and device_add_disk, it will trigger I/O. When we trigger disconnect I/O will block because our accessible (optimized) path is disconnecting, but the alternate path is inaccessible, so I/O blocks. Then disconnect tries to flush the ana_work and hangs. [ 605.550896] Workqueue: nvme-wq nvme_ana_work [nvme_core] [ 605.552087] Call Trace: [ 605.552683] __schedule+0x2b9/0x6c0 [ 605.553507] schedule+0x42/0xb0 [ 605.554201] io_schedule+0x16/0x40 [ 605.555012] do_read_cache_page+0x438/0x830 [ 605.556925] read_cache_page+0x12/0x20 [ 605.557757] read_dev_sector+0x27/0xc0 [ 605.558587] amiga_partition+0x4d/0x4c5 [ 605.561278] check_partition+0x154/0x244 [ 605.562138] rescan_partitions+0xae/0x280 [ 605.563076] __blkdev_get+0x40f/0x560 [ 605.563830] blkdev_get+0x3d/0x140 [ 605.564500] __device_add_disk+0x388/0x480 [ 605.565316] device_add_disk+0x13/0x20 [ 605.566070] nvme_mpath_set_live+0x5e/0x130 [nvme_core] [ 605.567114] nvme_update_ns_ana_state+0x2c/0x30 [nvme_core] [ 605.568197] nvme_update_ana_state+0xca/0xe0 [nvme_core] [ 605.569360] nvme_parse_ana_log+0xa1/0x180 [nvme_core] [ 605.571385] nvme_read_ana_log+0x76/0x100 [nvme_core] [ 605.572376] nvme_ana_work+0x15/0x20 [nvme_core] [ 605.573330] process_one_work+0x1db/0x380 [ 605.574144] worker_thread+0x4d/0x400 [ 605.574896] kthread+0x104/0x140 [ 605.577205] ret_from_fork+0x35/0x40 [ 605.577955] INFO: task nvme:14044 blocked for more than 120 seconds. [ 605.579239] Tainted: G OE 5.3.5-050305-generic #201910071830 [ 605.580712] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 605.582320] nvme D 0 14044 14043 0x00000000 [ 605.583424] Call Trace: [ 605.583935] __schedule+0x2b9/0x6c0 [ 605.584625] schedule+0x42/0xb0 [ 605.585290] schedule_timeout+0x203/0x2f0 [ 605.588493] wait_for_completion+0xb1/0x120 [ 605.590066] __flush_work+0x123/0x1d0 [ 605.591758] __cancel_work_timer+0x10e/0x190 [ 605.593542] cancel_work_sync+0x10/0x20 [ 605.594347] nvme_mpath_stop+0x2f/0x40 [nvme_core] [ 605.595328] nvme_stop_ctrl+0x12/0x50 [nvme_core] [ 605.596262] nvme_do_delete_ctrl+0x3f/0x90 [nvme_core] [ 605.597333] nvme_sysfs_delete+0x5c/0x70 [nvme_core] [ 605.598320] dev_attr_store+0x17/0x30 Fix this by introducing a new state: NVME_CTRL_DELETE_NOIO, which will indicate the phase of controller deletion where I/O cannot be allowed to access the namespace. NVME_CTRL_DELETING still allows mpath I/O to be issued to the bottom device, and only after we flush the ana_work and scan_work (after nvme_stop_ctrl and nvme_prep_remove_namespaces) we change the state to NVME_CTRL_DELETING_NOIO. Also we prevent ana_work from re-firing by aborting early if we are not LIVE, so we should be safe here. In addition, change the transport drivers to follow the updated state machine. Fixes: 0d0b660f214d ("nvme: add ANA support") Reported-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 15 +++++++++++++++ drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/host/fabrics.h | 3 ++- drivers/nvme/host/fc.c | 1 + drivers/nvme/host/multipath.c | 18 +++++++++++++++--- drivers/nvme/host/nvme.h | 6 ++++++ drivers/nvme/host/rdma.c | 10 ++++++---- drivers/nvme/host/tcp.c | 15 +++++++++------ 8 files changed, 55 insertions(+), 15 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1d7c7afb1348..c16bfdff2953 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -366,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } break; + case NVME_CTRL_DELETING_NOIO: + switch (old_state) { + case NVME_CTRL_DELETING: + case NVME_CTRL_DEAD: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; case NVME_CTRL_DEAD: switch (old_state) { case NVME_CTRL_DELETING: @@ -403,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl) case NVME_CTRL_CONNECTING: return false; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: case NVME_CTRL_DEAD: return true; default: @@ -3476,6 +3487,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", [NVME_CTRL_DEAD] = "dead", }; @@ -4112,6 +4124,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) if (ctrl->state == NVME_CTRL_DEAD) nvme_kill_queues(ctrl); + /* this is a no-op when called from the controller reset handler */ + nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); + down_write(&ctrl->namespaces_rwsem); list_splice_init(&ctrl->namespaces, &ns_list); up_write(&ctrl->namespaces_rwsem); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2a6c8190eeb7..4ec4829d6233 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, struct request *rq) { - if (ctrl->state != NVME_CTRL_DELETING && + if (ctrl->state != NVME_CTRL_DELETING_NOIO && ctrl->state != NVME_CTRL_DEAD && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a0ec40ab62ee..a9c1e3b4585e 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - if (likely(ctrl->state == NVME_CTRL_LIVE)) + if (likely(ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_DELETING)) return true; return __nvmf_check_ready(ctrl, rq, queue_live); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6aa30bb5a762..b27c54dc6683 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -826,6 +826,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) break; case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: default: /* no action to take - let it delete */ break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74bad4e3d377..900b35d47ec7 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static bool nvme_path_is_disabled(struct nvme_ns *ns) { - return ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags) || - test_bit(NVME_NS_REMOVING, &ns->flags); + /* + * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should + * still be able to complete assuming that the controller is connected. + * Otherwise it will fail immediately and return to the requeue list. + */ + if (ns->ctrl->state != NVME_CTRL_LIVE && + ns->ctrl->state != NVME_CTRL_DELETING) + return true; + if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || + test_bit(NVME_NS_REMOVING, &ns->flags)) + return true; + return false; } static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) @@ -563,6 +572,9 @@ static void nvme_ana_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + if (ctrl->state != NVME_CTRL_LIVE) + return; + nvme_read_ana_log(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 92629758b77c..1609267a1f0e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -190,6 +190,11 @@ static inline u16 nvme_req_qid(struct request *req) * @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the * transport * @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion) + * @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not + * disabled/failed immediately. This state comes + * after all async event processing took place and + * before ns removal and the controller deletion + * progress * @NVME_CTRL_DEAD: Controller is non-present/unresponsive during * shutdown or removal. In this case we forcibly * kill all inflight I/O as they have no chance to @@ -201,6 +206,7 @@ enum nvme_ctrl_state { NVME_CTRL_RESETTING, NVME_CTRL_CONNECTING, NVME_CTRL_DELETING, + NVME_CTRL_DELETING_NOIO, NVME_CTRL_DEAD, }; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 467da08db309..5c3848974ccb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1102,11 +1102,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -1159,8 +1160,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); return; } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b2e73e19ef01..8c8fb65ca928 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1950,11 +1950,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { /* - * state change failure is ok if we're in DELETING state, + * state change failure is ok if we started ctrl delete, * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -2010,8 +2011,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->admin_q); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } @@ -2046,8 +2048,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work) nvme_tcp_teardown_ctrl(ctrl, false); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); return; } -- cgit v1.2.3 From 653303f21682916dc3a70c24f68233b4392d52d6 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 23 Jul 2020 18:14:10 -0700 Subject: nvme-hwmon: log the controller device name Stay consistent with the rest of the driver Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/hwmon.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 23ba8bf678ae..412a6c97c0d8 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -241,7 +241,8 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl) err = nvme_hwmon_get_smart_log(data); if (err) { - dev_warn(dev, "Failed to read smart log (error %d)\n", err); + dev_warn(ctrl->device, + "Failed to read smart log (error %d)\n", err); devm_kfree(dev, data); return; } -- cgit v1.2.3 From 237480760c5050e8e897846b93ba9ffdb6444301 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 14 Jul 2020 12:03:36 -0700 Subject: nvme-fc: set max_segments to lldd max value Currently the FC transport is set max_hw_sectors based on the lldds max sgl segment count. However, the block queue max segments is set based on the controller's max_segments count, which the transport does not set. As such, the lldd is receiving sgl lists that are exceeding its max segment count. Set the controller max segment count and derive max_hw_sectors from the max segment count. Signed-off-by: James Smart Reviewed-by: Max Gurtovoy Reviewed-by: Himanshu Madhani Reviewed-by: Ewan D. Milne Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b27c54dc6683..eae43bb444e0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3002,8 +3002,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue; - ctrl->ctrl.max_hw_sectors = - (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); + ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; + ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << + (ilog2(SZ_4K) - 9); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); -- cgit v1.2.3 From 34efa23234c8b55dd178bba2216ca9ae9b50e1c9 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 24 Jul 2020 09:40:00 -0700 Subject: nvmet-fc: check successful reference in nvmet_fc_find_target_assoc When searching for an association based on an association id, when there is a match, the code takes a reference. However, it is not validating that the reference taking was successful. Check the status of the reference. If unsuccessful, the device is being deleted and should be ignored. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 27fd3b5aa621..c15356b5e09f 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1243,7 +1243,8 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport, list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { if (association_id == assoc->association_id) { ret = assoc; - nvmet_fc_tgt_a_get(assoc); + if (!nvmet_fc_tgt_a_get(assoc)) + ret = NULL; break; } } -- cgit v1.2.3 From ece0278c1c96905c53a6cbb253927530fc707cfa Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 24 Jul 2020 09:40:48 -0700 Subject: nvmet-fc: remove redundant del_work_active flag The transport has a del_work_active flag to avoid duplicate scheduling of the del_work item. This is redundant with the checks that schedule_work() makes. Remove the del_work_active flag. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index c15356b5e09f..55bafd56166a 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -167,7 +167,6 @@ struct nvmet_fc_tgt_assoc { struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; struct work_struct del_work; - atomic_t del_work_active; }; @@ -1090,7 +1089,6 @@ nvmet_fc_delete_assoc(struct work_struct *work) container_of(work, struct nvmet_fc_tgt_assoc, del_work); nvmet_fc_delete_target_assoc(assoc); - atomic_set(&assoc->del_work_active, 0); nvmet_fc_tgt_a_put(assoc); } @@ -1123,7 +1121,6 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) INIT_LIST_HEAD(&assoc->a_list); kref_init(&assoc->ref); INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); - atomic_set(&assoc->del_work_active, 0); atomic_set(&assoc->terminating, 0); while (needrandom) { @@ -1478,21 +1475,15 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) { struct nvmet_fc_tgt_assoc *assoc, *next; unsigned long flags; - int ret; spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry_safe(assoc, next, &tgtport->assoc_list, a_list) { if (!nvmet_fc_tgt_a_get(assoc)) continue; - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } } spin_unlock_irqrestore(&tgtport->lock, flags); } @@ -1534,7 +1525,6 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port, struct nvmet_fc_tgt_assoc *assoc, *next; unsigned long flags; bool noassoc = true; - int ret; spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry_safe(assoc, next, @@ -1546,14 +1536,9 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port, continue; assoc->hostport->invalid = 1; noassoc = false; - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } } spin_unlock_irqrestore(&tgtport->lock, flags); @@ -1574,7 +1559,6 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) struct nvmet_fc_tgt_queue *queue; unsigned long flags; bool found_ctrl = false; - int ret; /* this is a bit ugly, but don't want to make locks layered */ spin_lock_irqsave(&nvmet_fc_tgtlock, flags); @@ -1598,14 +1582,9 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) nvmet_fc_tgtport_put(tgtport); if (found_ctrl) { - ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1); - if (ret == 0) { - if (!schedule_work(&assoc->del_work)) - nvmet_fc_tgt_a_put(assoc); - } else { + if (!schedule_work(&assoc->del_work)) /* already deleting - release local reference */ nvmet_fc_tgt_a_put(assoc); - } return; } -- cgit v1.2.3 From 2bf5d3bbffad06639db518c8dcf8a14c7dce770e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:12 -0600 Subject: nvme: clear any SGL flags in passthru commands The host driver should decide whether to use SGLs or PRPs and they currently assume the flags are cleared after the call to nvme_setup_cmd(). However, passed-through commands may erroneously set these bits; so clear them for all cases. Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c16bfdff2953..2818c17e92ed 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -604,6 +604,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; } +static void nvme_setup_passthrough(struct request *req, + struct nvme_command *cmd) +{ + memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + /* passthru commands should let the driver set the SGL flags */ + cmd->common.flags &= ~NVME_CMD_SGL_ALL; +} + static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -769,7 +777,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: - memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); + nvme_setup_passthrough(req, cmd); break; case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); -- cgit v1.2.3 From df21b6b1934e89c2cc2bb1332146ed6c2df1321c Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:13 -0600 Subject: nvme: create helper function to obtain command effects Separate the code to obtain command effects from the code to start a passthru request and move the nvme_passthru_start() and nvme_passthru_end() functions up above nvme_submit_user_cmd() in order that they may be used in a new helper a subsequent patch. The new helper function will be necessary for nvmet passthru code to determine if we need to change out of interrupt context to handle the effects. It is exported in the NVME_TARGET_PASSTHRU namespace. Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 192 ++++++++++++++++++++++++----------------------- drivers/nvme/host/nvme.h | 3 + 2 files changed, 103 insertions(+), 92 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2818c17e92ed..ab040eef83e7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -928,6 +928,106 @@ out: return ERR_PTR(ret); } +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); + + return effects; +} +EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = nvme_command_effects(ctrl, ns, opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + mutex_lock(&ctrl->scan_lock); + mutex_lock(&ctrl->subsys->lock); + nvme_mpath_start_freeze(ctrl->subsys); + nvme_mpath_wait_freeze(ctrl->subsys); + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + if (_nvme_revalidate_disk(ns->disk)) + nvme_set_queue_dying(ns); + else if (blk_queue_is_zoned(ns->disk->queue)) { + /* + * IO commands are required to fully revalidate a zoned + * device. Force the command effects to trigger rescan + * work so report zones can run in a context with + * unfrozen IO queues. + */ + *effects |= NVME_CMD_EFFECTS_NCC; + } + up_read(&ctrl->namespaces_rwsem); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. + */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl, &effects); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_unfreeze(ctrl); + nvme_mpath_unfreeze(ctrl->subsys); + mutex_unlock(&ctrl->subsys->lock); + nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); + mutex_unlock(&ctrl->scan_lock); + } + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { + nvme_queue_scan(ctrl); + flush_work(&ctrl->scan_work); + } +} + static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, @@ -1394,98 +1494,6 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, lower_32_bits(io.slba), NULL, 0); } -static u32 nvme_known_admin_effects(u8 opcode) -{ - switch (opcode) { - case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | - NVME_CMD_EFFECTS_CSE_MASK; - case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_CSE_MASK; - default: - break; - } - return 0; -} - -static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 opcode) -{ - u32 effects = 0; - - if (ns) { - if (ns->head->effects) - effects = le32_to_cpu(ns->head->effects->iocs[opcode]); - if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) - dev_warn(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", - opcode, effects); - return 0; - } - - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); - - /* - * For simplicity, IO to all namespaces is quiesced even if the command - * effects say only one namespace is affected. - */ - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { - mutex_lock(&ctrl->scan_lock); - mutex_lock(&ctrl->subsys->lock); - nvme_mpath_start_freeze(ctrl->subsys); - nvme_mpath_wait_freeze(ctrl->subsys); - nvme_start_freeze(ctrl); - nvme_wait_freeze(ctrl); - } - return effects; -} - -static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) -{ - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - if (_nvme_revalidate_disk(ns->disk)) - nvme_set_queue_dying(ns); - else if (blk_queue_is_zoned(ns->disk->queue)) { - /* - * IO commands are required to fully revalidate a zoned - * device. Force the command effects to trigger rescan - * work so report zones can run in a context with - * unfrozen IO queues. - */ - *effects |= NVME_CMD_EFFECTS_NCC; - } - up_read(&ctrl->namespaces_rwsem); -} - -static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) -{ - /* - * Revalidate LBA changes prior to unfreezing. This is necessary to - * prevent memory corruption if a logical block size was changed by - * this command. - */ - if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl, &effects); - if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { - nvme_unfreeze(ctrl); - nvme_mpath_unfreeze(ctrl->subsys); - mutex_unlock(&ctrl->subsys->lock); - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); - mutex_unlock(&ctrl->scan_lock); - } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_identify(ctrl); - if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { - nvme_queue_scan(ctrl); - flush_work(&ctrl->scan_work); - } -} - static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1609267a1f0e..25063f041f8d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -790,4 +790,7 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl); static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { } #endif +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode); + #endif /* _NVME_H */ -- cgit v1.2.3 From 17365ae6975c7f7494a2d1cd0bf18b5ed238e072 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:14 -0600 Subject: nvme: introduce nvme_execute_passthru_rq to call nvme_passthru_[start|end]() Introduce a new nvme_execute_passthru_rq() helper which calls nvme_passthru_[start|end]() around blk_execute_rq(). This ensures all passthru calls (including nvme_submit_io()) will be wrapped appropriately. nvme_execute_passthru_rq() will also be useful for the nvmet passthru code and is exported in the NVME_TARGET_PASSTHRU namespace. Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 22 +++++++++++++++------- drivers/nvme/host/nvme.h | 1 + 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ab040eef83e7..8296c1248f87 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1028,6 +1028,20 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) } } +void nvme_execute_passthru_rq(struct request *rq) +{ + struct nvme_command *cmd = nvme_req(rq)->cmd; + struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; + struct nvme_ns *ns = rq->q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + u32 effects; + + effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); + blk_execute_rq(rq->q, disk, rq, 0); + nvme_passthru_end(ctrl, effects); +} +EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); + static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, @@ -1066,7 +1080,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, } } - blk_execute_rq(req->q, disk, req, 0); + nvme_execute_passthru_rq(req); if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ret = -EINTR; else @@ -1500,7 +1514,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd cmd; struct nvme_command c; unsigned timeout = 0; - u32 effects; u64 result; int status; @@ -1527,12 +1540,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &result, timeout); - nvme_passthru_end(ctrl, effects); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -1548,7 +1559,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; - u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1574,12 +1584,10 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout); - nvme_passthru_end(ctrl, effects); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 25063f041f8d..4e3bc4b66c57 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -792,5 +792,6 @@ static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { } u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); +void nvme_execute_passthru_rq(struct request *rq); #endif /* _NVME_H */ -- cgit v1.2.3 From f783f444ceaa7eab043a7a5ae0e34c68743c0358 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:15 -0600 Subject: nvme: introduce nvme_ctrl_get_by_path() nvme_ctrl_get_by_path() is analogous to blkdev_get_by_path() except it gets a struct nvme_ctrl from the path to its char dev (/dev/nvme0). It makes use of filp_open() to open the file and uses the private data to obtain a pointer to the struct nvme_ctrl. If the fops of the file do not match, -EINVAL is returned. The purpose of this function is to support NVMe-OF target passthru and is exported under the NVME_TARGET_PASSTHRU namespace. Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 23 +++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 24 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8296c1248f87..c44316c5018f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4590,6 +4590,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_sync_queues); +struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path) +{ + struct nvme_ctrl *ctrl; + struct file *f; + + f = filp_open(path, O_RDWR, 0); + if (IS_ERR(f)) + return ERR_CAST(f); + + if (f->f_op != &nvme_dev_fops) { + ctrl = ERR_PTR(-EINVAL); + goto out_close; + } + + ctrl = f->private_data; + nvme_get_ctrl(ctrl); + +out_close: + filp_close(f, NULL); + return ctrl; +} +EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU); + /* * Check we didn't inadvertently grow the command structure sizes: */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4e3bc4b66c57..5af59a9bac53 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -793,5 +793,6 @@ static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { } u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); void nvme_execute_passthru_rq(struct request *rq); +struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path); #endif /* _NVME_H */ -- cgit v1.2.3 From 24493b8b854a67234c5c142a7ea075e2174a3084 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:16 -0600 Subject: nvme: export nvme_find_get_ns() and nvme_put_ns() nvme_find_get_ns() and nvme_put_ns() are required by the target passthru code and are exported under the NVME_TARGET_PASSTHRU namespace. Based-on-a-patch-by: Chaitanya Kulkarni Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 ++++-- drivers/nvme/host/nvme.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c44316c5018f..05aa568a60af 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -465,10 +465,11 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static void nvme_put_ns(struct nvme_ns *ns) +void nvme_put_ns(struct nvme_ns *ns) { kref_put(&ns->kref, nvme_free_ns); } +EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { @@ -3827,7 +3828,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) return nsa->head->ns_id - nsb->head->ns_id; } -static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = NULL; @@ -3845,6 +3846,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) up_read(&ctrl->namespaces_rwsem); return ret; } +EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5af59a9bac53..c5c1bac797aa 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -794,5 +794,7 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); void nvme_execute_passthru_rq(struct request *rq); struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path); +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +void nvme_put_ns(struct nvme_ns *ns); #endif /* _NVME_H */ -- cgit v1.2.3 From c1fef73f793b7fd9d2ffcb5ef85807ea55bf7adb Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:17 -0600 Subject: nvmet: add passthru code to process commands Add passthru command handling capability for the NVMeOF target and export passthru APIs which are used to integrate passthru code with nvmet-core. The new file passthru.c handles passthru cmd parsing and execution. In the passthru mode, we create a block layer request from the nvmet request and map the data on to the block layer request. Admin commands and features are on an allow list as there are a number of each that don't make too much sense with passthrough. We use an allow list such that new commands can be considered before being blindly passed through. In both cases, vendor specific commands are always allowed. We also reject reservation IO commands as the underlying device cannot differentiate between multiple hosts behind a fabric. Based-on-a-patch-by: Chaitanya Kulkarni Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Makefile | 1 + drivers/nvme/target/admin-cmd.c | 7 +- drivers/nvme/target/core.c | 3 + drivers/nvme/target/nvmet.h | 39 ++++ drivers/nvme/target/passthru.c | 458 ++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 4 + 6 files changed, 510 insertions(+), 2 deletions(-) create mode 100644 drivers/nvme/target/passthru.c (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 2b33836f3d3e..ebf91fc4c72e 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o +nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 55918fcef80b..e9fe91786bbb 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -749,7 +749,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) return 0; } -static void nvmet_execute_set_features(struct nvmet_req *req) +void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); @@ -824,7 +824,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req) nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); } -static void nvmet_execute_get_features(struct nvmet_req *req) +void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); @@ -940,6 +940,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; + if (nvmet_req_passthru_ctrl(req)) + return nvmet_parse_passthru_admin_cmd(req); + switch (cmd->common.opcode) { case nvme_admin_get_log_page: req->execute = nvmet_execute_get_log_page; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 59621b816f6e..c5a1c82e699b 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -849,6 +849,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; + if (nvmet_req_passthru_ctrl(req)) + return nvmet_parse_passthru_io_cmd(req); + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); if (unlikely(!req->ns)) { req->error_loc = offsetof(struct nvme_common_command, nsid); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 49e14111446c..51719dc62083 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -242,6 +242,10 @@ struct nvmet_subsys { struct config_group allowed_hosts_group; struct nvmet_subsys_model __rcu *model; + +#ifdef CONFIG_NVME_TARGET_PASSTHRU + struct nvme_ctrl *passthru_ctrl; +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; static inline struct nvmet_subsys *to_subsys(struct config_item *item) @@ -321,6 +325,11 @@ struct nvmet_req { struct bio_vec *bvec; struct work_struct work; } f; + struct { + struct request *rq; + struct work_struct work; + bool use_workqueue; + } p; }; int sg_cnt; int metadata_sg_cnt; @@ -400,6 +409,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status); int nvmet_req_alloc_sgls(struct nvmet_req *req); void nvmet_req_free_sgls(struct nvmet_req *req); +void nvmet_execute_set_features(struct nvmet_req *req); +void nvmet_execute_get_features(struct nvmet_req *req); void nvmet_execute_keep_alive(struct nvmet_req *req); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, @@ -532,6 +543,34 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req) sizeof(struct nvme_dsm_range); } +#ifdef CONFIG_NVME_TARGET_PASSTHRU +u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req); +u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req); +static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +{ + return subsys->passthru_ctrl; +} +#else /* CONFIG_NVME_TARGET_PASSTHRU */ +static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) +{ + return 0; +} +static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) +{ + return 0; +} +static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +{ + return NULL; +} +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + +static inline struct nvme_ctrl * +nvmet_req_passthru_ctrl(struct nvmet_req *req) +{ + return nvmet_passthru_ctrl(req->sq->ctrl->subsys); +} + u16 errno_to_nvme_status(struct nvmet_req *req, int errno); /* Convert a 32-bit number to a 16-bit 0's based number */ diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c new file mode 100644 index 000000000000..b9727553ab30 --- /dev/null +++ b/drivers/nvme/target/passthru.c @@ -0,0 +1,458 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe Over Fabrics Target Passthrough command implementation. + * + * Copyright (c) 2017-2018 Western Digital Corporation or its + * affiliates. + * Copyright (c) 2019-2020, Eideticom Inc. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include + +#include "../host/nvme.h" +#include "nvmet.h" + +MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU); + +static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl; + u16 status = NVME_SC_SUCCESS; + struct nvme_id_ctrl *id; + u32 max_hw_sectors; + int page_shift; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, id, sizeof(*id)); + if (status) + goto out_free; + + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* + * The passthru NVMe driver may have a limit on the number of segments + * which depends on the host's memory fragementation. To solve this, + * ensure mdts is limited to the pages equal to the number of segments. + */ + max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9), + pctrl->max_hw_sectors); + + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + + id->mdts = ilog2(max_hw_sectors) + 9 - page_shift; + + id->acl = 3; + /* + * We export aerl limit for the fabrics controller, update this when + * passthru based aerl support is added. + */ + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* emulate kas as most of the PCIe ctrl don't have a support for kas */ + id->kas = cpu_to_le16(NVMET_KAS); + + /* don't support host memory buffer */ + id->hmpre = 0; + id->hmmin = 0; + + id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes); + id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + /* don't support fuse commands */ + id->fuses = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->flags & NVMF_KEYED_SGLS) + id->sgls |= cpu_to_le32(1 << 2); + if (req->port->inline_data_size) + id->sgls |= cpu_to_le32(1 << 20); + + /* + * When passsthru controller is setup using nvme-loop transport it will + * export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in + * the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl() + * code path with duplicate ctr subsynqn. In order to prevent that we + * mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn. + */ + memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn)); + + /* use fabric id-ctrl values */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + req->port->inline_data_size) / 16); + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* Support multipath connections with fabrics */ + id->cmic |= 1 << 1; + + /* Disable reservations, see nvmet_parse_passthru_io_cmd() */ + id->oncs &= cpu_to_le16(~NVME_CTRL_ONCS_RESERVATIONS); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(struct nvme_id_ctrl)); + +out_free: + kfree(id); + return status; +} + +static u16 nvmet_passthru_override_id_ns(struct nvmet_req *req) +{ + u16 status = NVME_SC_SUCCESS; + struct nvme_id_ns *id; + int i; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, id, sizeof(struct nvme_id_ns)); + if (status) + goto out_free; + + for (i = 0; i < (id->nlbaf + 1); i++) + if (id->lbaf[i].ms) + memset(&id->lbaf[i], 0, sizeof(id->lbaf[i])); + + id->flbas = id->flbas & ~(1 << 4); + + /* + * Presently the NVMEof target code does not support sending + * metadata, so we must disable it here. This should be updated + * once target starts supporting metadata. + */ + id->mc = 0; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + +out_free: + kfree(id); + return status; +} + +static void nvmet_passthru_execute_cmd_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); + struct request *rq = req->p.rq; + u16 status; + + nvme_execute_passthru_rq(rq); + + status = nvme_req(rq)->status; + if (status == NVME_SC_SUCCESS && + req->cmd->common.opcode == nvme_admin_identify) { + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + nvmet_passthru_override_id_ctrl(req); + break; + case NVME_ID_CNS_NS: + nvmet_passthru_override_id_ns(req); + break; + } + } + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, status); + blk_put_request(rq); +} + +static void nvmet_passthru_req_done(struct request *rq, + blk_status_t blk_status) +{ + struct nvmet_req *req = rq->end_io_data; + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, nvme_req(rq)->status); + blk_put_request(rq); +} + +static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) +{ + int sg_cnt = req->sg_cnt; + struct scatterlist *sg; + int op_flags = 0; + struct bio *bio; + int i, ret; + + if (req->cmd->common.opcode == nvme_cmd_flush) + op_flags = REQ_FUA; + else if (nvme_is_write(req->cmd)) + op_flags = REQ_SYNC | REQ_IDLE; + + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio->bi_end_io = bio_put; + bio->bi_opf = req_op(rq) | op_flags; + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, + sg->offset) < sg->length) { + bio_put(bio); + return -EINVAL; + } + sg_cnt--; + } + + ret = blk_rq_append_bio(rq, &bio); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + + return 0; +} + +static void nvmet_passthru_execute_cmd(struct nvmet_req *req) +{ + struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct request_queue *q = ctrl->admin_q; + struct nvme_ns *ns = NULL; + struct request *rq = NULL; + u32 effects; + u16 status; + int ret; + + if (likely(req->sq->qid != 0)) { + u32 nsid = le32_to_cpu(req->cmd->common.nsid); + + ns = nvme_find_get_ns(ctrl, nsid); + if (unlikely(!ns)) { + pr_err("failed to get passthru ns nsid:%u\n", nsid); + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto fail_out; + } + + q = ns->queue; + } + + rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + if (IS_ERR(rq)) { + rq = NULL; + status = NVME_SC_INTERNAL; + goto fail_out; + } + + if (req->sg_cnt) { + ret = nvmet_passthru_map_sg(req, rq); + if (unlikely(ret)) { + status = NVME_SC_INTERNAL; + goto fail_out; + } + } + + /* + * If there are effects for the command we are about to execute, or + * an end_req function we need to use nvme_execute_passthru_rq() + * synchronously in a work item seeing the end_req function and + * nvme_passthru_end() can't be called in the request done callback + * which is typically in interrupt context. + */ + effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode); + if (req->p.use_workqueue || effects) { + INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work); + req->p.rq = rq; + schedule_work(&req->p.work); + } else { + rq->end_io_data = req; + blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0, + nvmet_passthru_req_done); + } + + if (ns) + nvme_put_ns(ns); + + return; + +fail_out: + if (ns) + nvme_put_ns(ns); + nvmet_req_complete(req, status); + blk_put_request(rq); +} + +/* + * We need to emulate set host behaviour to ensure that any requested + * behaviour of the target's host matches the requested behaviour + * of the device's host and fail otherwise. + */ +static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req) +{ + struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct nvme_feat_host_behavior *host; + u16 status = NVME_SC_INTERNAL; + int ret; + + host = kzalloc(sizeof(*host) * 2, GFP_KERNEL); + if (!host) + goto out_complete_req; + + ret = nvme_get_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + if (ret) + goto out_free_host; + + status = nvmet_copy_from_sgl(req, 0, &host[1], sizeof(*host)); + if (status) + goto out_free_host; + + if (memcmp(&host[0], &host[1], sizeof(host[0]))) { + pr_warn("target host has requested different behaviour from the local host\n"); + status = NVME_SC_INTERNAL; + } + +out_free_host: + kfree(host); +out_complete_req: + nvmet_req_complete(req, status); +} + +static u16 nvmet_setup_passthru_command(struct nvmet_req *req) +{ + req->p.use_workqueue = false; + req->execute = nvmet_passthru_execute_cmd; + return NVME_SC_SUCCESS; +} + +u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) +{ + switch (req->cmd->common.opcode) { + case nvme_cmd_resv_register: + case nvme_cmd_resv_report: + case nvme_cmd_resv_acquire: + case nvme_cmd_resv_release: + /* + * Reservations cannot be supported properly because the + * underlying device has no way of differentiating different + * hosts that connect via fabrics. This could potentially be + * emulated in the future if regular targets grow support for + * this feature. + */ + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return nvmet_setup_passthru_command(req); +} + +/* + * Only features that are emulated or specifically allowed in the list are + * passed down to the controller. This function implements the allow list for + * both get and set features. + */ +static u16 nvmet_passthru_get_set_features(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ARBITRATION: + case NVME_FEAT_POWER_MGMT: + case NVME_FEAT_LBA_RANGE: + case NVME_FEAT_TEMP_THRESH: + case NVME_FEAT_ERR_RECOVERY: + case NVME_FEAT_VOLATILE_WC: + case NVME_FEAT_WRITE_ATOMIC: + case NVME_FEAT_AUTO_PST: + case NVME_FEAT_TIMESTAMP: + case NVME_FEAT_HCTM: + case NVME_FEAT_NOPSC: + case NVME_FEAT_RRL: + case NVME_FEAT_PLM_CONFIG: + case NVME_FEAT_PLM_WINDOW: + case NVME_FEAT_HOST_BEHAVIOR: + case NVME_FEAT_SANITIZE: + case NVME_FEAT_VENDOR_START ... NVME_FEAT_VENDOR_END: + return nvmet_setup_passthru_command(req); + + case NVME_FEAT_ASYNC_EVENT: + /* There is no support for forwarding ASYNC events */ + case NVME_FEAT_IRQ_COALESCE: + case NVME_FEAT_IRQ_CONFIG: + /* The IRQ settings will not apply to the target controller */ + case NVME_FEAT_HOST_MEM_BUF: + /* + * Any HMB that's set will not be passed through and will + * not work as expected + */ + case NVME_FEAT_SW_PROGRESS: + /* + * The Pre-Boot Software Load Count doesn't make much + * sense for a target to export + */ + case NVME_FEAT_RESV_MASK: + case NVME_FEAT_RESV_PERSIST: + /* No reservations, see nvmet_parse_passthru_io_cmd() */ + default: + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} + +u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) +{ + /* + * Passthru all vendor specific commands + */ + if (req->cmd->common.opcode >= nvme_admin_vendor_start) + return nvmet_setup_passthru_command(req); + + switch (req->cmd->common.opcode) { + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + return NVME_SC_SUCCESS; + case nvme_admin_keep_alive: + /* + * Most PCIe ctrls don't support keep alive cmd, we route keep + * alive to the non-passthru mode. In future please change this + * code when PCIe ctrls with keep alive support available. + */ + req->execute = nvmet_execute_keep_alive; + return NVME_SC_SUCCESS; + case nvme_admin_set_features: + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_KATO: + case NVME_FEAT_NUM_QUEUES: + case NVME_FEAT_HOST_ID: + req->execute = nvmet_execute_set_features; + return NVME_SC_SUCCESS; + case NVME_FEAT_HOST_BEHAVIOR: + req->execute = nvmet_passthru_set_host_behaviour; + return NVME_SC_SUCCESS; + default: + return nvmet_passthru_get_set_features(req); + } + break; + case nvme_admin_get_features: + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_KATO: + case NVME_FEAT_NUM_QUEUES: + case NVME_FEAT_HOST_ID: + req->execute = nvmet_execute_get_features; + return NVME_SC_SUCCESS; + default: + return nvmet_passthru_get_set_features(req); + } + break; + case nvme_admin_identify: + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + case NVME_ID_CNS_NS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + default: + return nvmet_setup_passthru_command(req); + } + case nvme_admin_get_log_page: + return nvmet_setup_passthru_command(req); + default: + /* Reject commands not in the allowlist above */ + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1643005d21e3..d92535997687 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -312,6 +312,7 @@ enum { NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_RESERVATIONS = 1 << 5, NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, @@ -982,6 +983,7 @@ enum nvme_admin_opcode { nvme_admin_security_recv = 0x82, nvme_admin_sanitize_nvm = 0x84, nvme_admin_get_lba_status = 0x86, + nvme_admin_vendor_start = 0xC0, }; #define nvme_admin_opcode_name(opcode) { opcode, #opcode } @@ -1045,6 +1047,8 @@ enum { NVME_FEAT_RESV_MASK = 0x82, NVME_FEAT_RESV_PERSIST = 0x83, NVME_FEAT_WRITE_PROTECT = 0x84, + NVME_FEAT_VENDOR_START = 0xC0, + NVME_FEAT_VENDOR_END = 0xFF, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, -- cgit v1.2.3 From ba76af676cd0514614b5a99b8adad9d3956f5d7d Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:18 -0600 Subject: nvmet: Add passthru enable/disable helpers This patch adds helper functions which are used in the NVMeOF configfs when the user is configuring the passthru subsystem. Here we ensure that only one subsys is assigned to each nvme_ctrl by using an xarray on the cntlid. The subsystem's version number is overridden by the passed through controller's version. However, if that version is less than 1.2.1, then we bump the advertised version to that and print a warning in dmesg. Based-on-a-patch-by: Chaitanya Kulkarni Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 4 ++ drivers/nvme/target/core.c | 10 ++++- drivers/nvme/target/nvmet.h | 12 ++++++ drivers/nvme/target/passthru.c | 86 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index cdec47de89ed..61c258dea88a 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -879,6 +879,10 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, int major, minor, tertiary = 0; int ret; + /* passthru subsystems use the underlying controller's version */ + if (nvmet_passthru_ctrl(subsys)) + return -EINVAL; + ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); if (ret != 2 && ret != 3) return -EINVAL; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index c5a1c82e699b..b92f45f5cd5b 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -544,6 +544,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns) mutex_lock(&subsys->lock); ret = 0; + + if (nvmet_passthru_ctrl(subsys)) { + pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); + goto out_unlock; + } + if (ns->enabled) goto out_unlock; @@ -1473,7 +1479,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, if (!subsys) return ERR_PTR(-ENOMEM); - subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */ + subsys->ver = NVMET_DEFAULT_VS; /* generate a random serial number as our controllers are ephemeral: */ get_random_bytes(&subsys->serial, sizeof(subsys->serial)); @@ -1516,6 +1522,8 @@ static void nvmet_subsys_free(struct kref *ref) WARN_ON_ONCE(!xa_empty(&subsys->namespaces)); xa_destroy(&subsys->namespaces); + nvmet_passthru_subsys_free(subsys); + kfree(subsys->subsysnqn); kfree_rcu(subsys->model, rcuhead); kfree(subsys); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 51719dc62083..9d0aa6560c0c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -21,6 +21,8 @@ #include #include +#define NVMET_DEFAULT_VS NVME_VS(1, 3, 0) + #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) @@ -245,6 +247,7 @@ struct nvmet_subsys { #ifdef CONFIG_NVME_TARGET_PASSTHRU struct nvme_ctrl *passthru_ctrl; + char *passthru_ctrl_path; #endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; @@ -544,6 +547,9 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req) } #ifdef CONFIG_NVME_TARGET_PASSTHRU +void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys); +int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); +void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys); u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req); u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req); static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) @@ -551,6 +557,12 @@ static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) return subsys->passthru_ctrl; } #else /* CONFIG_NVME_TARGET_PASSTHRU */ +static inline void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys) +{ +} +static inline void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ +} static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) { return 0; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index b9727553ab30..89d91dc999a6 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -15,6 +15,11 @@ MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU); +/* + * xarray to maintain one passthru subsystem per nvme controller. + */ +static DEFINE_XARRAY(passthru_subsystems); + static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -456,3 +461,84 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } } + +int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) +{ + struct nvme_ctrl *ctrl; + int ret = -EINVAL; + void *old; + + mutex_lock(&subsys->lock); + if (!subsys->passthru_ctrl_path) + goto out_unlock; + if (subsys->passthru_ctrl) + goto out_unlock; + + if (subsys->nr_namespaces) { + pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); + goto out_unlock; + } + + ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + pr_err("failed to open nvme controller %s\n", + subsys->passthru_ctrl_path); + + goto out_unlock; + } + + old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL, + subsys, GFP_KERNEL); + if (xa_is_err(old)) { + ret = xa_err(old); + goto out_put_ctrl; + } + + if (old) + goto out_put_ctrl; + + subsys->passthru_ctrl = ctrl; + subsys->ver = ctrl->vs; + + if (subsys->ver < NVME_VS(1, 2, 1)) { + pr_warn("nvme controller version is too old: %llu.%llu.%llu, advertising 1.2.1\n", + NVME_MAJOR(subsys->ver), NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); + subsys->ver = NVME_VS(1, 2, 1); + } + + mutex_unlock(&subsys->lock); + return 0; + +out_put_ctrl: + nvme_put_ctrl(ctrl); +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ + if (subsys->passthru_ctrl) { + xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid); + nvme_put_ctrl(subsys->passthru_ctrl); + } + subsys->passthru_ctrl = NULL; + subsys->ver = NVMET_DEFAULT_VS; +} + +void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ + mutex_lock(&subsys->lock); + __nvmet_passthru_ctrl_disable(subsys); + mutex_unlock(&subsys->lock); +} + +void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys) +{ + mutex_lock(&subsys->lock); + __nvmet_passthru_ctrl_disable(subsys); + mutex_unlock(&subsys->lock); + kfree(subsys->passthru_ctrl_path); +} -- cgit v1.2.3 From cae5b01a2afc2058bee94b4b8f873d3a4a9ec41e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:19 -0600 Subject: nvmet: introduce the passthru configfs interface When CONFIG_NVME_TARGET_PASSTHRU as 'passthru' directory will be added to each subsystem. The directory is similar to a namespace and has two attributes: device_path and enable. The user must set the path to the nvme controller's char device and write '1' to enable the subsystem to use passthru. Any given subsystem is prevented from enabling both a regular namespace and the passthru device. If one is enabled, enabling the other will produce an error. Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 99 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/nvmet.h | 1 + 2 files changed, 100 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 61c258dea88a..74b2b61c773b 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -666,6 +666,103 @@ static const struct config_item_type nvmet_namespaces_type = { .ct_owner = THIS_MODULE, }; +#ifdef CONFIG_NVME_TARGET_PASSTHRU + +static ssize_t nvmet_passthru_device_path_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + + return snprintf(page, PAGE_SIZE, "%s\n", subsys->passthru_ctrl_path); +} + +static ssize_t nvmet_passthru_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + size_t len; + int ret; + + mutex_lock(&subsys->lock); + + ret = -EBUSY; + if (subsys->passthru_ctrl) + goto out_unlock; + + ret = -EINVAL; + len = strcspn(page, "\n"); + if (!len) + goto out_unlock; + + kfree(subsys->passthru_ctrl_path); + ret = -ENOMEM; + subsys->passthru_ctrl_path = kstrndup(page, len, GFP_KERNEL); + if (!subsys->passthru_ctrl_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + + return count; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} +CONFIGFS_ATTR(nvmet_passthru_, device_path); + +static ssize_t nvmet_passthru_enable_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + + return sprintf(page, "%d\n", subsys->passthru_ctrl ? 1 : 0); +} + +static ssize_t nvmet_passthru_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_passthru_ctrl_enable(subsys); + else + nvmet_passthru_ctrl_disable(subsys); + + return ret ? ret : count; +} +CONFIGFS_ATTR(nvmet_passthru_, enable); + +static struct configfs_attribute *nvmet_passthru_attrs[] = { + &nvmet_passthru_attr_device_path, + &nvmet_passthru_attr_enable, + NULL, +}; + +static const struct config_item_type nvmet_passthru_type = { + .ct_attrs = nvmet_passthru_attrs, + .ct_owner = THIS_MODULE, +}; + +static void nvmet_add_passthru_group(struct nvmet_subsys *subsys) +{ + config_group_init_type_name(&subsys->passthru_group, + "passthru", &nvmet_passthru_type); + configfs_add_default_group(&subsys->passthru_group, + &subsys->group); +} + +#else /* CONFIG_NVME_TARGET_PASSTHRU */ + +static void nvmet_add_passthru_group(struct nvmet_subsys *subsys) +{ +} + +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + static int nvmet_port_subsys_allow_link(struct config_item *parent, struct config_item *target) { @@ -1125,6 +1222,8 @@ static struct config_group *nvmet_subsys_make(struct config_group *group, configfs_add_default_group(&subsys->allowed_hosts_group, &subsys->group); + nvmet_add_passthru_group(subsys); + return &subsys->group; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 9d0aa6560c0c..47ee3fb193bd 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -248,6 +248,7 @@ struct nvmet_subsys { #ifdef CONFIG_NVME_TARGET_PASSTHRU struct nvme_ctrl *passthru_ctrl; char *passthru_ctrl_path; + struct config_group passthru_group; #endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; -- cgit v1.2.3 From d9174c1a5da02cf7477b2aae0bcc1a236d3c0262 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 24 Jul 2020 11:25:20 -0600 Subject: nvmet: introduce the passthru Kconfig option This patch updates KConfig file for the NVMeOF target where we add new option so that user can selectively enable/disable passthru code. Signed-off-by: Chaitanya Kulkarni [logang@deltatee.com: fixed some of the wording in the help message] Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4474952d64c6..8056955e652c 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -16,6 +16,18 @@ config NVME_TARGET To configure the NVMe target you probably want to use the nvmetcli tool from http://git.infradead.org/users/hch/nvmetcli.git. +config NVME_TARGET_PASSTHRU + bool "NVMe Target Passthrough support" + depends on NVME_TARGET + depends on NVME_CORE=y || NVME_CORE=NVME_TARGET + help + This enables target side NVMe passthru controller support for the + NVMe Over Fabrics protocol. It allows for hosts to manage and + directly access an actual NVMe controller residing on the target + side, incuding executing Vendor Unique Commands. + + If unsure, say N. + config NVME_TARGET_LOOP tristate "NVMe loopback device support" depends on NVME_TARGET -- cgit v1.2.3 From 2875b0aecabe2f081a8432e2bc85b85df0529490 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 24 Jul 2020 15:10:12 -0700 Subject: nvme-tcp: fix controller reset hang during traffic commit fe35ec58f0d3 ("block: update hctx map when use multiple maps") exposed an issue where we may hang trying to wait for queue freeze during I/O. We call blk_mq_update_nr_hw_queues which in case of multiple queue maps (which we have now for default/read/poll) is attempting to freeze the queue. However we never started queue freeze when starting the reset, which means that we have inflight pending requests that entered the queue that we will not complete once the queue is quiesced. So start a freeze before we quiesce the queue, and unfreeze the queue after we successfully connected the I/O queues (and make sure to call blk_mq_update_nr_hw_queues only after we are sure that the queue was already frozen). This follows to how the pci driver handles resets. Fixes: fe35ec58f0d3 ("block: update hctx map when use multiple maps") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8c8fb65ca928..378c049e0a5e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1771,15 +1771,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(ctrl->tagset, - ctrl->queue_count - 1); } ret = nvme_tcp_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(ctrl); + nvme_wait_freeze(ctrl); + blk_mq_update_nr_hw_queues(ctrl->tagset, + ctrl->queue_count - 1); + nvme_unfreeze(ctrl); + } + return 0; out_cleanup_connect_q: @@ -1884,6 +1889,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; + nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); if (ctrl->tagset) { -- cgit v1.2.3 From 9f98772ba307dd89a3d17dc2589f213d3972fc64 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 27 Jul 2020 17:32:09 -0700 Subject: nvme-rdma: fix controller reset hang during traffic commit fe35ec58f0d3 ("block: update hctx map when use multiple maps") exposed an issue where we may hang trying to wait for queue freeze during I/O. We call blk_mq_update_nr_hw_queues which in case of multiple queue maps (which we have now for default/read/poll) is attempting to freeze the queue. However we never started queue freeze when starting the reset, which means that we have inflight pending requests that entered the queue that we will not complete once the queue is quiesced. So start a freeze before we quiesce the queue, and unfreeze the queue after we successfully connected the I/O queues (and make sure to call blk_mq_update_nr_hw_queues only after we are sure that the queue was already frozen). This follows to how the pci driver handles resets. Fixes: fe35ec58f0d3 ("block: update hctx map when use multiple maps") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5c3848974ccb..44c76ffbb264 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -967,15 +967,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) ret = PTR_ERR(ctrl->ctrl.connect_q); goto out_free_tag_set; } - } else { - blk_mq_update_nr_hw_queues(&ctrl->tag_set, - ctrl->ctrl.queue_count - 1); } ret = nvme_rdma_start_io_queues(ctrl); if (ret) goto out_cleanup_connect_q; + if (!new) { + nvme_start_queues(&ctrl->ctrl); + nvme_wait_freeze(&ctrl->ctrl); + blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, + ctrl->ctrl.queue_count - 1); + nvme_unfreeze(&ctrl->ctrl); + } + return 0; out_cleanup_connect_q: @@ -1008,6 +1013,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { if (ctrl->ctrl.queue_count > 1) { + nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); if (ctrl->ctrl.tagset) { -- cgit v1.2.3 From 3f6e3246db0e6f92e784965d9d0edb8abe6c6b74 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 27 Jul 2020 18:08:02 +0200 Subject: nvme-multipath: fix logic for non-optimized paths Handle the special case where we have exactly one optimized path, which we should keep using in this case. Fixes: 75c10e732724 ("nvme-multipath: round-robin I/O policy") Signed off-by: Martin Wilck Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 900b35d47ec7..93c70e1591de 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -255,6 +255,12 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, fallback = ns; } + /* No optimized path found, re-check the current path */ + if (!nvme_path_is_disabled(old) && + old->ana_state == NVME_ANA_OPTIMIZED) { + found = old; + goto out; + } if (!fallback) return NULL; found = fallback; -- cgit v1.2.3 From fbd6a42d8932e172921c7de10468a2e12c34846b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jul 2020 18:08:03 +0200 Subject: nvme-multipath: do not fall back to __nvme_find_path() for non-optimized paths When nvme_round_robin_path() finds a valid namespace we should be using it; falling back to __nvme_find_path() for non-optimized paths will cause the result from nvme_round_robin_path() to be ignored for non-optimized paths. Fixes: 75c10e732724 ("nvme-multipath: round-robin I/O policy") Signed-off-by: Martin Wilck Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 93c70e1591de..3ded54d2c9c6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -281,10 +281,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) struct nvme_ns *ns; ns = srcu_dereference(head->current_path[node], &head->srcu); - if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns) - ns = nvme_round_robin_path(head, node, ns); - if (unlikely(!ns || !nvme_path_is_optimized(ns))) - ns = __nvme_find_path(head, node); + if (unlikely(!ns)) + return __nvme_find_path(head, node); + + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) + return nvme_round_robin_path(head, node, ns); + if (unlikely(!nvme_path_is_optimized(ns))) + return __nvme_find_path(head, node); return ns; } -- cgit v1.2.3 From 64d452b3560b7a55277c8d9ef0a8635e62136580 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 28 Jul 2020 19:36:47 -0700 Subject: nvme-loop: set ctrl state connecting after init When creating a loop controller (ctrl) in nvme_loop_create_ctrl() -> nvme_init_ctrl() we set the ctrl state to NVME_CTRL_NEW. Prior to [1] NVME_CTRL_NEW state was allowed in nvmf_check_ready() for fabrics command type connect. Now, this fails in the following code path for fabrics connect command when creating admin queue :- nvme_loop_create_ctrl() nvme_loo_configure_admin_queue() nvmf_connect_admin_queue() __nvme_submit_sync_cmd() blk_execute_rq() nvme_loop_queue_rq() nvmf_check_ready() # echo "transport=loop,nqn=fs" > /dev/nvme-fabrics [ 6047.741327] nvmet: adding nsid 1 to subsystem fs [ 6048.756430] nvme nvme1: Connect command failed, error wo/DNR bit: 880 We need to set the ctrl state to NVME_CTRL_CONNECTING after :- nvme_loop_create_ctrl() nvme_init_ctrl() so that the above mentioned check for nvmf_check_ready() will return true. This patch sets the ctrl state to connecting after we init the ctrl in nvme_loop_create_ctrl() nvme_init_ctrl() . [1] commit aa63fa6776a7 ("nvme-fabrics: allow to queue requests for live queues") Fixes: aa63fa6776a7 ("nvme-fabrics: allow to queue requests for live queues") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Tested-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f2c80a51985f..14d820c9a276 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -583,6 +583,9 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, if (ret) goto out_put_ctrl; + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); + WARN_ON_ONCE(!changed); + ret = -ENOMEM; ctrl->ctrl.sqsize = opts->queue_size - 1; -- cgit v1.2.3 From b6cec06d19d90db5dbcc50034fb33983f6259b8b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 28 Jul 2020 19:36:48 -0700 Subject: nvme-loop: remove extra variable in create ctrl We can call the nvme_change_ctrl_state() directly and have WARN_ON_ONCE(1) call instead of having to use an extra variable which matches the name of the function. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 14d820c9a276..4884ef1e46a2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -444,7 +444,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) { struct nvme_loop_ctrl *ctrl = container_of(work, struct nvme_loop_ctrl, ctrl.reset_work); - bool changed; int ret; nvme_stop_ctrl(&ctrl->ctrl); @@ -471,8 +470,8 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) blk_mq_update_nr_hw_queues(&ctrl->tag_set, ctrl->ctrl.queue_count - 1); - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) + WARN_ON_ONCE(1); nvme_start_ctrl(&ctrl->ctrl); @@ -567,7 +566,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_loop_ctrl *ctrl; - bool changed; int ret; ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); @@ -583,8 +581,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, if (ret) goto out_put_ctrl; - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); - WARN_ON_ONCE(!changed); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) + WARN_ON_ONCE(1); ret = -ENOMEM; @@ -620,8 +618,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) + WARN_ON_ONCE(1); mutex_lock(&nvme_loop_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); -- cgit v1.2.3