From e9ef46369f5107e634a93b7fc4e62a1f53343197 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 24 Jul 2012 15:01:04 -0600 Subject: [PATCH 01/12] NVMe: Set request queue logical block size Sets the request queue logical block size with the block size of the namespace. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index f4996b0e4b1a..38b9c73f6706 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1344,6 +1344,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->disk = disk; lbaf = id->flbas & 0xf; ns->lba_shift = id->lbaf[lbaf].ds; + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); disk->major = nvme_major; disk->minors = NVME_MINORS; From 5c42ea1643a630060f9e71e06d3933d244970967 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Jul 2012 16:05:18 -0600 Subject: [PATCH 02/12] NVMe: Fix nvme module init when nvme_major is set register_blkdev returns 0 when given a valid major number. 
Reported-by: Ross Zwisler Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 38b9c73f6706..46e33eec6298 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1716,9 +1716,11 @@ static int __init nvme_init(void) if (IS_ERR(nvme_thread)) return PTR_ERR(nvme_thread); - nvme_major = register_blkdev(nvme_major, "nvme"); - if (nvme_major <= 0) + result = register_blkdev(nvme_major, "nvme"); + if (result < 0) goto kill_kthread; + else if (result > 0) + nvme_major = result; result = pci_register_driver(&nvme_driver); if (result) From 50af8baec46a99a9b81a4600c0374f83a5a590a9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Jul 2012 16:07:55 -0600 Subject: [PATCH 03/12] NVMe: replace nvme_ns with nvme_dev for user admin The function nvme_user_admin_cmd does not require a namespace to proceed. Replace with the nvme_dev structure so that it can be called from contexts that do not have a namespace. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 46e33eec6298..89935853cb85 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1151,10 +1151,9 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_admin_cmd(struct nvme_ns *ns, +static int nvme_user_admin_cmd(struct nvme_dev *dev, struct nvme_admin_cmd __user *ucmd) { - struct nvme_dev *dev = ns->dev; struct nvme_admin_cmd cmd; struct nvme_command c; int status, length; @@ -1209,7 +1208,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, case NVME_IOCTL_ID: return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns, (void __user *)arg); + return nvme_user_admin_cmd(ns->dev, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); default: From a42ceccef0c43b46ff6bc1b12a7c1076ef243df1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 25 Jul 2012 16:06:38 -0600 Subject: [PATCH 04/12] NVMe: use namespace id for nvme_get_features The specification does not provide a use for command dword11 in the NVMe Get Features command, but does use the NSID for some features. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 89935853cb85..7bcd88205a41 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -840,15 +840,15 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, } static int nvme_get_features(struct nvme_dev *dev, unsigned fid, - unsigned dword11, dma_addr_t dma_addr) + unsigned nsid, dma_addr_t dma_addr) { struct nvme_command c; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); return nvme_submit_admin_cmd(dev, &c, NULL); } From 8fc23e032debd682f5ba9fc524a5846c10d2c522 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Jul 2012 11:29:57 -0600 Subject: [PATCH 05/12] NVMe: Set block queue max sectors Set the max hw sectors in a namespace's request queue if the nvme device has a max data transfer size. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 7 +++++++ include/linux/nvme.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 7bcd88205a41..11951fa11a90 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -78,6 +78,7 @@ struct nvme_dev { char serial[20]; char model[40]; char firmware_rev[8]; + u32 max_hw_sectors; }; /* @@ -1344,6 +1345,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, lbaf = id->flbas & 0xf; ns->lba_shift = id->lbaf[lbaf].ds; blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (dev->max_hw_sectors) + blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); disk->major = nvme_major; disk->minors = NVME_MINORS; @@ -1485,6 +1488,10 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); + if (ctrl->mdts) { + int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; + dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); + } id_ns = mem; for (i = 1; i <= nn; i++) { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9490a00529f4..8c71d2004c6d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -37,6 +37,7 @@ struct nvme_bar { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) enum { NVME_CC_ENABLE = 1 << 0, From a0cadb85b8b758608ae0759151e29de7581c6731 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 27 Jul 2012 13:57:23 -0400 Subject: [PATCH 06/12] NVMe: Do not set IO queue depth beyond device max Set the depth for IO queues to the device's maximum supported queue entries if the requested depth exceeds the device's capabilities. 
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 10 ++++++---- include/linux/nvme.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 11951fa11a90..af1ef39bd6b4 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -893,7 +893,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info)); + unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * + sizeof(struct nvme_cmd_info)); struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); if (!nvmeq) return NULL; @@ -1391,7 +1392,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) { - int result, cpu, i, nr_io_queues, db_bar_size; + int result, cpu, i, nr_io_queues, db_bar_size, q_depth; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1437,9 +1438,10 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) cpu = cpumask_next(cpu, cpu_online_mask); } + q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, + NVME_Q_DEPTH); for (i = 0; i < nr_io_queues; i++) { - dev->queues[i + 1] = nvme_create_queue(dev, i + 1, - NVME_Q_DEPTH, i); + dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); if (IS_ERR(dev->queues[i + 1])) return PTR_ERR(dev->queues[i + 1]); dev->queue_count++; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8c71d2004c6d..c25cccaa555a 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -35,6 +35,7 @@ struct nvme_bar { __u64 acq; /* Admin CQ Base Address */ }; +#define NVME_CAP_MQES(cap) ((cap) & 0xffff) #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) From 
c7d36ab8fa04c213328119a9c0d66985fe204ee5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 27 Jul 2012 11:53:28 -0600 Subject: [PATCH 07/12] NVMe: Fix uninitialized iod compiler warning Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index af1ef39bd6b4..6c0eb768562f 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1159,7 +1159,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, struct nvme_admin_cmd cmd; struct nvme_command c; int status, length; - struct nvme_iod *iod; + struct nvme_iod *uninitialized_var(iod); if (!capable(CAP_SYS_ADMIN)) return -EACCES; From 22fff826e715e9727d3c7a69f15e602a9801b673 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 20 Jan 2012 07:55:30 -0500 Subject: [PATCH 08/12] NVMe: handle allocation failure in nvme_map_user_pages() We should return here and avoid a NULL dereference. Signed-off-by: Dan Carpenter Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 6c0eb768562f..064e86a6bb4e 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1044,6 +1044,8 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, offset = offset_in_page(addr); count = DIV_ROUND_UP(offset + length, PAGE_SIZE); pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); err = get_user_pages_fast(addr, count, 1, pages); if (err < count) { From 0ac13140d796eb1e2f8956aea97a6e5e4ebcf981 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 31 Jul 2012 13:31:15 -0400 Subject: [PATCH 09/12] NVMe: Fix whitespace damage in nvme_init Commit 5c42ea1643 used spaces instead of tabs. Also remove the unnecessary initialisation of the 'result' variable. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 064e86a6bb4e..0ba6b7cb344b 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1720,7 +1720,7 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { - int result = -EBUSY; + int result; nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); if (IS_ERR(nvme_thread)) @@ -1730,7 +1730,7 @@ static int __init nvme_init(void) if (result < 0) goto kill_kthread; else if (result > 0) - nvme_major = result; + nvme_major = result; result = pci_register_driver(&nvme_driver); if (result) From cd58ad7d188c643cf572b038909c2f7dd96fdafe Mon Sep 17 00:00:00 2001 From: Quoc-Son Anh Date: Tue, 21 Feb 2012 16:50:53 -0700 Subject: [PATCH 10/12] NVMe: Use ida for nvme device instance Signed-off-by: Quoc-Son Anh Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 0ba6b7cb344b..3278fbdb8dc0 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -1576,15 +1576,33 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } -/* XXX: Use an ida or something to let remove / add work correctly */ -static void nvme_set_instance(struct nvme_dev *dev) +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_dev *dev) { - static int instance; - dev->instance = instance++; + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + dev->instance = instance; + return 0; } static void nvme_release_instance(struct nvme_dev *dev) { + 
spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, dev->instance); + spin_unlock(&dev_list_lock); } static int __devinit nvme_probe(struct pci_dev *pdev, @@ -1617,7 +1635,10 @@ static int __devinit nvme_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, dev); dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); - nvme_set_instance(dev); + result = nvme_set_instance(dev); + if (result) + goto disable; + dev->entry[0].vector = pdev->irq; result = nvme_setup_prp_pools(dev); From 9e866774aab5d2654b0fa8f97890f68913f05700 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Aug 2012 13:55:56 -0400 Subject: [PATCH 11/12] NVMe: Free admin queue memory on initialisation failure If the adapter fails initialisation, the memory allocated for the admin queue may not be freed. Split the memory freeing part of nvme_free_queue() into nvme_free_queue_mem() and call it in the case of initialisation failure. Signed-off-by: Matthew Wilcox Reported-by: Vishal Verma --- drivers/block/nvme.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 3278fbdb8dc0..214037055e2a 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -868,6 +868,15 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid, return nvme_submit_admin_cmd(dev, &c, result); } +static void nvme_free_queue_mem(struct nvme_queue *nvmeq) +{ + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + static void nvme_free_queue(struct nvme_dev *dev, int qid) { struct nvme_queue *nvmeq = dev->queues[qid]; @@ -882,11 +891,7 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid) adapter_delete_cq(dev, qid); } - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void 
*)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); + nvme_free_queue_mem(nvmeq); } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, @@ -982,7 +987,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) { - int result; + int result = 0; u32 aqa; u64 cap; unsigned long timeout; @@ -1012,17 +1017,22 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; dev->db_stride = NVME_CAP_STRIDE(cap); - while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { + while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { msleep(100); if (fatal_signal_pending(current)) - return -EINTR; + result = -EINTR; if (time_after(jiffies, timeout)) { dev_err(&dev->pci_dev->dev, "Device not ready; aborting initialisation\n"); - return -ENODEV; + result = -ENODEV; } } + if (result) { + nvme_free_queue_mem(nvmeq); + return result; + } + result = queue_request_irq(dev, nvmeq, "nvme admin"); dev->queues[0] = nvmeq; return result; From a09115b23e2002bb35b7bfd337683f00875671ec Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 7 Aug 2012 15:56:23 -0400 Subject: [PATCH 12/12] NVMe: Cancel outstanding IOs on queue deletion If the device is hot-unplugged while there are active commands, we should time out the I/Os so that upper layers don't just see the I/Os disappear. 
Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 55 ++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 214037055e2a..f9ad514c9227 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -868,6 +868,33 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid, return nvme_submit_admin_cmd(dev, &c, result); } +/** + * nvme_cancel_ios - Cancel outstanding I/Os + * @nvmeq: The queue to cancel I/Os on + * @timeout: True to only cancel I/Os which have timed out + */ +static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + unsigned long now = jiffies; + int cmdid; + + for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { + void *ctx; + nvme_completion_fn fn; + static struct nvme_completion cqe = { + .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, + }; + + if (timeout && !time_after(now, info[cmdid].timeout)) + continue; + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); + ctx = cancel_cmdid(nvmeq, cmdid, &fn); + fn(nvmeq->dev, ctx, &cqe); + } +} + static void nvme_free_queue_mem(struct nvme_queue *nvmeq) { dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), @@ -882,6 +909,10 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid) struct nvme_queue *nvmeq = dev->queues[qid]; int vector = dev->entry[nvmeq->cq_vector].vector; + spin_lock_irq(&nvmeq->q_lock); + nvme_cancel_ios(nvmeq, false); + spin_unlock_irq(&nvmeq->q_lock); + irq_set_affinity_hint(vector, NULL); free_irq(vector, nvmeq); @@ -1236,26 +1267,6 @@ static const struct block_device_operations nvme_fops = { .compat_ioctl = nvme_ioctl, }; -static void nvme_timeout_ios(struct nvme_queue *nvmeq) -{ - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; - - for_each_set_bit(cmdid, 
nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, }; - - if (!time_after(now, info[cmdid].timeout)) - continue; - dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq->dev, ctx, &cqe); - } -} - static void nvme_resubmit_bios(struct nvme_queue *nvmeq) { while (bio_list_peek(&nvmeq->sq_cong)) { @@ -1287,7 +1298,7 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); if (nvme_process_cq(nvmeq)) printk("process_cq did something\n"); - nvme_timeout_ios(nvmeq); + nvme_cancel_ios(nvmeq, true); nvme_resubmit_bios(nvmeq); spin_unlock_irq(&nvmeq->q_lock); } @@ -1549,8 +1560,6 @@ static int nvme_dev_remove(struct nvme_dev *dev) list_del(&dev->node); spin_unlock(&dev_list_lock); - /* TODO: wait all I/O finished or cancel them */ - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { list_del(&ns->list); del_gendisk(ns->disk);