From 7f870c81a068fd1e69fe556f7fe85a5ece144b0c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 28 Dec 2014 12:35:05 +0200 Subject: virtio_pci: drop virtio_config dependency virtio_pci does not depend on virtio_config: let's not include it, users can pull it in as necessary. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 35b552c7f330..509d630f04f4 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -39,7 +39,7 @@ #ifndef _LINUX_VIRTIO_PCI_H #define _LINUX_VIRTIO_PCI_H -#include +#include #ifndef VIRTIO_PCI_NO_LEGACY -- cgit v1.2.3 From 71d70c266c84c4e708bb36b20d0c0a29af42821c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 29 May 2013 11:52:22 +0930 Subject: virtio-pci: define layout for virtio 1.0 Based on patches by Michael S. Tsirkin , but I found it hard to follow so changed to use structures which are more self-documenting. Signed-off-by: Rusty Russell Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_pci.h | 62 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 509d630f04f4..4e054235358f 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -99,4 +99,66 @@ /* Vector value used to disable MSI for queue */ #define VIRTIO_MSI_NO_VECTOR 0xffff +#ifndef VIRTIO_PCI_NO_MODERN + +/* IDs for different capabilities. Must all exist. */ + +/* Common configuration */ +#define VIRTIO_PCI_CAP_COMMON_CFG 1 +/* Notifications */ +#define VIRTIO_PCI_CAP_NOTIFY_CFG 2 +/* ISR access */ +#define VIRTIO_PCI_CAP_ISR_CFG 3 +/* Device specific confiuration */ +#define VIRTIO_PCI_CAP_DEVICE_CFG 4 + +/* This is the PCI capability header: */ +struct virtio_pci_cap { + __u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */ + __u8 cap_next; /* Generic PCI field: next ptr. */ + __u8 cap_len; /* Generic PCI field: capability length */ + __u8 type_and_bar; /* Upper 3 bits: bar. + * Lower 3 is VIRTIO_PCI_CAP_*_CFG. */ + __le32 offset; /* Offset within bar. */ + __le32 length; /* Length. */ +}; + +#define VIRTIO_PCI_CAP_BAR_SHIFT 5 +#define VIRTIO_PCI_CAP_BAR_MASK 0x7 +#define VIRTIO_PCI_CAP_TYPE_SHIFT 0 +#define VIRTIO_PCI_CAP_TYPE_MASK 0x7 + +struct virtio_pci_notify_cap { + struct virtio_pci_cap cap; + __le32 notify_off_multiplier; /* Multiplier for queue_notify_off. */ +}; + +/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */ +struct virtio_pci_common_cfg { + /* About the whole device. */ + __le32 device_feature_select; /* read-write */ + __le32 device_feature; /* read-only */ + __le32 guest_feature_select; /* read-write */ + __le32 guest_feature; /* read-write */ + __le16 msix_config; /* read-write */ + __le16 num_queues; /* read-only */ + __u8 device_status; /* read-write */ + __u8 config_generation; /* read-only */ + + /* About a specific virtqueue. */ + __le16 queue_select; /* read-write */ + __le16 queue_size; /* read-write, power of 2. */ + __le16 queue_msix_vector; /* read-write */ + __le16 queue_enable; /* read-write */ + __le16 queue_notify_off; /* read-only */ + __le32 queue_desc_lo; /* read-write */ + __le32 queue_desc_hi; /* read-write */ + __le32 queue_avail_lo; /* read-write */ + __le32 queue_avail_hi; /* read-write */ + __le32 queue_used_lo; /* read-write */ + __le32 queue_used_hi; /* read-write */ +}; + +#endif /* VIRTIO_PCI_NO_MODERN */ + #endif -- cgit v1.2.3 From 1fcf0512c9c870e78e1c9898ecb9458593403466 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 11 Dec 2014 13:59:51 +0200 Subject: virtio_pci: modern driver Lightly tested against qemu. One thing *not* implemented here is separate mappings for descriptor/avail/used rings. That's nice to have, will be done later after we have core support. This also exposes the PCI layout to userspace, and adds macros for PCI layout offsets: QEMU wants it, so why not? Trust, but verify. Signed-off-by: Rusty Russell Signed-off-by: Michael S. Tsirkin --- drivers/virtio/Makefile | 2 +- drivers/virtio/virtio_pci_common.c | 14 +- drivers/virtio/virtio_pci_common.h | 25 +- drivers/virtio/virtio_pci_modern.c | 587 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_pci.h | 7 +- 5 files changed, 625 insertions(+), 10 deletions(-) create mode 100644 drivers/virtio/virtio_pci_modern.c (limited to 'include/uapi') diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index bf5104b56894..bd230d1c0533 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o -virtio_pci-y := virtio_pci_legacy.o virtio_pci_common.o +virtio_pci-y := virtio_pci_modern.o virtio_pci_legacy.o virtio_pci_common.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 457cbe29c8c4..8ae34a34f3af 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -505,7 +505,9 @@ static int virtio_pci_probe(struct pci_dev *pci_dev, if (rc) goto err_request_regions; - rc = virtio_pci_legacy_probe(vp_dev); + rc = virtio_pci_modern_probe(vp_dev); + if (rc == -ENODEV) + rc = virtio_pci_legacy_probe(vp_dev); if (rc) goto err_probe; @@ -518,7 +520,10 @@ static int virtio_pci_probe(struct pci_dev *pci_dev, return 0; err_register: - virtio_pci_legacy_remove(vp_dev); + if (vp_dev->ioaddr) + virtio_pci_legacy_remove(vp_dev); + else + virtio_pci_modern_remove(vp_dev); err_probe: pci_release_regions(pci_dev); err_request_regions: @@ -534,7 +539,10 @@ static void virtio_pci_remove(struct pci_dev *pci_dev) unregister_virtio_device(&vp_dev->vdev); - virtio_pci_legacy_remove(pci_dev); + if (vp_dev->ioaddr) + virtio_pci_legacy_remove(vp_dev); + else + virtio_pci_modern_remove(vp_dev); pci_release_regions(pci_dev); pci_disable_device(pci_dev); diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 2b1e70db44a0..610c43f19230 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -53,12 +53,29 @@ struct virtio_pci_device { struct virtio_device vdev; struct pci_dev *pci_dev; + /* In legacy mode, these two point to within ->legacy. */ + /* Where to read and clear interrupt */ + u8 __iomem *isr; + + /* Modern only fields */ + /* The IO mapping for the PCI config space (non-legacy mode) */ + struct virtio_pci_common_cfg __iomem *common; + /* Device-specific data (non-legacy mode) */ + void __iomem *device; + + /* So we can sanity-check accesses. */ + size_t device_len; + + /* Capability for when we need to map notifications per-vq. */ + int notify_map_cap; + + /* Multiply queue_notify_off by this value. (non-legacy mode). */ + u32 notify_offset_multiplier; + + /* Legacy only field */ /* the IO mapping for the PCI config space */ void __iomem *ioaddr; - /* the IO mapping for ISR operation */ - void __iomem *isr; - /* a list of queues so we can dispatch IRQs */ spinlock_t lock; struct list_head virtqueues; @@ -129,5 +146,7 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu); int virtio_pci_legacy_probe(struct virtio_pci_device *); void virtio_pci_legacy_remove(struct virtio_pci_device *); +int virtio_pci_modern_probe(struct virtio_pci_device *); +void virtio_pci_modern_remove(struct virtio_pci_device *); #endif diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c new file mode 100644 index 000000000000..a3d81013e0c2 --- /dev/null +++ b/drivers/virtio/virtio_pci_modern.c @@ -0,0 +1,587 @@ +/* + * Virtio PCI driver - modern (virtio 1.0) device support + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Anthony Liguori + * Rusty Russell + * Michael S. Tsirkin + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#define VIRTIO_PCI_NO_LEGACY +#include "virtio_pci_common.h" + +static void __iomem *map_capability(struct pci_dev *dev, int off, + size_t minlen, + u32 align, + u32 start, u32 size, + size_t *len) +{ + u8 bar; + u32 offset, length; + void __iomem *p; + + pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap, + bar), + &bar); + pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, offset), + &offset); + pci_read_config_dword(dev, off + offsetof(struct virtio_pci_cap, length), + &length); + + if (length <= start) { + dev_err(&dev->dev, + "virtio_pci: bad capability len %u (>%u expected)\n", + length, start); + return NULL; + } + + if (length - start < minlen) { + dev_err(&dev->dev, + "virtio_pci: bad capability len %u (>=%zu expected)\n", + length, minlen); + return NULL; + } + + length -= start; + + if (start + offset < offset) { + dev_err(&dev->dev, + "virtio_pci: map wrap-around %u+%u\n", + start, offset); + return NULL; + } + + offset += start; + + if (offset & (align - 1)) { + dev_err(&dev->dev, + "virtio_pci: offset %u not aligned to %u\n", + offset, align); + return NULL; + } + + if (length > size) + length = size; + + if (len) + *len = length; + + if (minlen + offset < minlen || + minlen + offset > pci_resource_len(dev, bar)) { + dev_err(&dev->dev, + "virtio_pci: map virtio %zu@%u " + "out of range on bar %i length %lu\n", + minlen, offset, + bar, (unsigned long)pci_resource_len(dev, bar)); + return NULL; + } + + p = pci_iomap_range(dev, bar, offset, length); + if (!p) + dev_err(&dev->dev, + "virtio_pci: unable to map virtio %u@%u on bar %i\n", + length, offset, bar); + return p; +} + +static void iowrite64_twopart(u64 val, __le32 __iomem *lo, __le32 __iomem *hi) +{ + iowrite32((u32)val, lo); + iowrite32(val >> 32, hi); +} + +/* virtio config->get_features() implementation */ +static u64 vp_get_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u64 features; + + iowrite32(0, &vp_dev->common->device_feature_select); + features = ioread32(&vp_dev->common->device_feature); + iowrite32(1, &vp_dev->common->device_feature_select); + features |= ((u64)ioread32(&vp_dev->common->device_feature) << 32); + + return features; +} + +/* virtio config->finalize_features() implementation */ +static int vp_finalize_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + if (!__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { + dev_err(&vdev->dev, "virtio: device uses modern interface " + "but does not have VIRTIO_F_VERSION_1\n"); + return -EINVAL; + } + + iowrite32(0, &vp_dev->common->guest_feature_select); + iowrite32((u32)vdev->features, &vp_dev->common->guest_feature); + iowrite32(1, &vp_dev->common->guest_feature_select); + iowrite32(vdev->features >> 32, &vp_dev->common->guest_feature); + + return 0; +} + +/* virtio config->get() implementation */ +static void vp_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u8 b; + __le16 w; + __le32 l; + + BUG_ON(offset + len > vp_dev->device_len); + + switch (len) { + case 1: + b = ioread8(vp_dev->device + offset); + memcpy(buf, &b, sizeof b); + break; + case 2: + w = cpu_to_le16(ioread16(vp_dev->device + offset)); + memcpy(buf, &w, sizeof w); + break; + case 4: + l = cpu_to_le32(ioread32(vp_dev->device + offset)); + memcpy(buf, &l, sizeof l); + break; + case 8: + l = cpu_to_le32(ioread32(vp_dev->device + offset)); + memcpy(buf, &l, sizeof l); + l = cpu_to_le32(ioread32(vp_dev->device + offset + sizeof l)); + memcpy(buf + sizeof l, &l, sizeof l); + break; + default: + BUG(); + } +} + +/* the config->set() implementation. it's symmetric to the config->get() + * implementation */ +static void vp_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u8 b; + __le16 w; + __le32 l; + + BUG_ON(offset + len > vp_dev->device_len); + + switch (len) { + case 1: + memcpy(&b, buf, sizeof b); + iowrite8(b, vp_dev->device + offset); + break; + case 2: + memcpy(&w, buf, sizeof w); + iowrite16(le16_to_cpu(w), vp_dev->device + offset); + break; + case 4: + memcpy(&l, buf, sizeof l); + iowrite32(le32_to_cpu(l), vp_dev->device + offset); + break; + case 8: + memcpy(&l, buf, sizeof l); + iowrite32(le32_to_cpu(l), vp_dev->device + offset); + memcpy(&l, buf + sizeof l, sizeof l); + iowrite32(le32_to_cpu(l), vp_dev->device + offset + sizeof l); + break; + default: + BUG(); + } +} + +static u32 vp_generation(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return ioread8(&vp_dev->common->config_generation); +} + +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return ioread8(&vp_dev->common->device_status); +} + +static void vp_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + iowrite8(status, &vp_dev->common->device_status); +} + +static void vp_reset(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* 0 status means a reset. */ + iowrite8(0, &vp_dev->common->device_status); + /* Flush out the status write, and flush in device writes, + * including MSI-X interrupts, if any. */ + ioread8(&vp_dev->common->device_status); + /* Flush pending VQ/configuration callbacks. */ + vp_synchronize_vectors(vdev); +} + +static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) +{ + /* Setup the vector used for configuration events */ + iowrite16(vector, &vp_dev->common->msix_config); + /* Verify we had enough resources to assign the vector */ + /* Will also flush the write out to device */ + return ioread16(&vp_dev->common->msix_config); +} + +static size_t vring_pci_size(u16 num) +{ + /* We only need a cacheline separation. */ + return PAGE_ALIGN(vring_size(num, SMP_CACHE_BYTES)); +} + +static void *alloc_virtqueue_pages(int *num) +{ + void *pages; + + /* TODO: allocate each queue chunk individually */ + for (; *num && vring_pci_size(*num) > PAGE_SIZE; *num /= 2) { + pages = alloc_pages_exact(vring_pci_size(*num), + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); + if (pages) + return pages; + } + + if (!*num) + return NULL; + + /* Try to get a single page. You are my only hope! */ + return alloc_pages_exact(vring_pci_size(*num), GFP_KERNEL|__GFP_ZERO); +} + +static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, + struct virtio_pci_vq_info *info, + unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name, + u16 msix_vec) +{ + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtqueue *vq; + u16 num, off; + int err; + + if (index >= ioread16(&cfg->num_queues)) + return ERR_PTR(-ENOENT); + + /* Select the queue we're interested in */ + iowrite16(index, &cfg->queue_select); + + /* Check if queue is either not available or already active. */ + num = ioread16(&cfg->queue_size); + if (!num || ioread8(&cfg->queue_enable)) + return ERR_PTR(-ENOENT); + + if (num & (num - 1)) { + dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", num); + return ERR_PTR(-EINVAL); + } + + /* get offset of notification word for this vq */ + off = ioread16(&cfg->queue_notify_off); + + info->num = num; + info->msix_vector = msix_vec; + + info->queue = alloc_virtqueue_pages(&info->num); + if (info->queue == NULL) + return ERR_PTR(-ENOMEM); + + /* create the vring */ + vq = vring_new_virtqueue(index, info->num, + SMP_CACHE_BYTES, &vp_dev->vdev, + true, info->queue, vp_notify, callback, name); + if (!vq) { + err = -ENOMEM; + goto err_new_queue; + } + + /* activate the queue */ + iowrite16(num, &cfg->queue_size); + iowrite64_twopart(virt_to_phys(info->queue), + &cfg->queue_desc_lo, &cfg->queue_desc_hi); + iowrite64_twopart(virt_to_phys(virtqueue_get_avail(vq)), + &cfg->queue_avail_lo, &cfg->queue_avail_hi); + iowrite64_twopart(virt_to_phys(virtqueue_get_used(vq)), + &cfg->queue_used_lo, &cfg->queue_used_hi); + + vq->priv = (void __force *)map_capability(vp_dev->pci_dev, + vp_dev->notify_map_cap, 2, 2, + off * vp_dev->notify_offset_multiplier, 2, + NULL); + + if (!vq->priv) { + err = -ENOMEM; + goto err_map_notify; + } + + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { + iowrite16(msix_vec, &cfg->queue_msix_vector); + msix_vec = ioread16(&cfg->queue_msix_vector); + if (msix_vec == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto err_assign_vector; + } + } + + return vq; + +err_assign_vector: + pci_iounmap(vp_dev->pci_dev, (void __iomem __force *)vq->priv); +err_map_notify: + vring_del_virtqueue(vq); +err_new_queue: + free_pages_exact(info->queue, vring_pci_size(info->num)); + return ERR_PTR(err); +} + +static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtqueue *vq; + int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names); + + if (rc) + return rc; + + /* Select and activate all queues. Has to be done last: once we do + * this, there's no way to go back except reset. + */ + list_for_each_entry(vq, &vdev->vqs, list) { + iowrite16(vq->index, &vp_dev->common->queue_select); + iowrite8(1, &vp_dev->common->queue_enable); + } + + return 0; +} + +static void del_vq(struct virtio_pci_vq_info *info) +{ + struct virtqueue *vq = info->vq; + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + + iowrite16(vq->index, &vp_dev->common->queue_select); + + if (vp_dev->msix_enabled) { + iowrite16(VIRTIO_MSI_NO_VECTOR, + &vp_dev->common->queue_msix_vector); + /* Flush the write out to device */ + ioread16(&vp_dev->common->queue_msix_vector); + } + + pci_iounmap(vp_dev->pci_dev, (void __force __iomem *)vq->priv); + + vring_del_virtqueue(vq); + + free_pages_exact(info->queue, vring_pci_size(info->num)); +} + +static const struct virtio_config_ops virtio_pci_config_ops = { + .get = vp_get, + .set = vp_set, + .generation = vp_generation, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vqs = vp_modern_find_vqs, + .del_vqs = vp_del_vqs, + .get_features = vp_get_features, + .finalize_features = vp_finalize_features, + .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, +}; + +/** + * virtio_pci_find_capability - walk capabilities to find device info. + * @dev: the pci device + * @cfg_type: the VIRTIO_PCI_CAP_* value we seek + * @ioresource_types: IORESOURCE_MEM and/or IORESOURCE_IO. + * + * Returns offset of the capability, or 0. + */ +static inline int virtio_pci_find_capability(struct pci_dev *dev, u8 cfg_type, + u32 ioresource_types) +{ + int pos; + + for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); + pos > 0; + pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { + u8 type, bar; + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + cfg_type), + &type); + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, + bar), + &bar); + + /* Ignore structures with reserved BAR values */ + if (bar > 0x5) + continue; + + if (type == cfg_type) { + if (pci_resource_len(dev, bar) && + pci_resource_flags(dev, bar) & ioresource_types) + return pos; + } + } + return 0; +} + +static void virtio_pci_release_dev(struct device *_d) +{ + struct virtio_device *vdev = dev_to_virtio(_d); + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + kfree(vp_dev); +} + +/* TODO: validate the ABI statically. */ +static inline void check_offsets(void) +{ +} + +/* the PCI probing function */ +int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) +{ + struct pci_dev *pci_dev = vp_dev->pci_dev; + int err, common, isr, notify, device; + u32 notify_length; + + check_offsets(); + + /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) + return -ENODEV; + + if (pci_dev->device < 0x1040) { + /* Transitional devices: use the PCI subsystem device id as + * virtio device id, same as legacy driver always did. + */ + vp_dev->vdev.id.device = pci_dev->subsystem_device; + } else { + /* Modern devices: simply use PCI device id, but start from 0x1040. */ + vp_dev->vdev.id.device = pci_dev->device - 0x1040; + } + vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; + + if (virtio_device_is_legacy_only(vp_dev->vdev.id)) + return -ENODEV; + + /* check for a common config: if not, use legacy mode (bar 0). */ + common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG, + IORESOURCE_IO | IORESOURCE_MEM); + if (!common) { + dev_info(&pci_dev->dev, + "virtio_pci: leaving for legacy driver\n"); + return -ENODEV; + } + + /* If common is there, these should be too... */ + isr = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_ISR_CFG, + IORESOURCE_IO | IORESOURCE_MEM); + notify = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_NOTIFY_CFG, + IORESOURCE_IO | IORESOURCE_MEM); + if (!isr || !notify) { + dev_err(&pci_dev->dev, + "virtio_pci: missing capabilities %i/%i/%i\n", + common, isr, notify); + return -EINVAL; + } + + /* Device capability is only mandatory for devices that have + * device-specific configuration. + */ + device = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_DEVICE_CFG, + IORESOURCE_IO | IORESOURCE_MEM); + + err = -EINVAL; + vp_dev->common = map_capability(pci_dev, common, + sizeof(struct virtio_pci_common_cfg), 4, + 0, sizeof(struct virtio_pci_common_cfg), + NULL); + if (!vp_dev->common) + goto err_map_common; + vp_dev->isr = map_capability(pci_dev, isr, sizeof(u8), 1, + 0, 1, + NULL); + if (!vp_dev->isr) + goto err_map_isr; + + /* Read notify_off_multiplier from config space. */ + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + notify_off_multiplier), + &vp_dev->notify_offset_multiplier); + /* Read notify length from config space. */ + pci_read_config_dword(pci_dev, + notify + offsetof(struct virtio_pci_notify_cap, + cap.length), + ¬ify_length); + + vp_dev->notify_map_cap = notify; + + /* Again, we don't know how much we should map, but PAGE_SIZE + * is more than enough for all existing devices. + */ + if (device) { + vp_dev->device = map_capability(pci_dev, device, 0, 4, + 0, PAGE_SIZE, + &vp_dev->device_len); + if (!vp_dev->device) + goto err_map_device; + } + + vp_dev->vdev.config = &virtio_pci_config_ops; + + vp_dev->config_vector = vp_config_vector; + vp_dev->setup_vq = setup_vq; + vp_dev->del_vq = del_vq; + + return 0; + +err_map_device: + pci_iounmap(pci_dev, vp_dev->isr); +err_map_isr: + pci_iounmap(pci_dev, vp_dev->common); +err_map_common: + return err; +} + +void virtio_pci_modern_remove(struct virtio_pci_device *vp_dev) +{ + struct pci_dev *pci_dev = vp_dev->pci_dev; + + if (vp_dev->device) + pci_iounmap(pci_dev, vp_dev->device); + pci_iounmap(pci_dev, vp_dev->isr); + pci_iounmap(pci_dev, vp_dev->common); +} diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 4e054235358f..a2b2e1353e30 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -117,10 +117,11 @@ struct virtio_pci_cap { __u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */ __u8 cap_next; /* Generic PCI field: next ptr. */ __u8 cap_len; /* Generic PCI field: capability length */ - __u8 type_and_bar; /* Upper 3 bits: bar. - * Lower 3 is VIRTIO_PCI_CAP_*_CFG. */ + __u8 cfg_type; /* Identifies the structure. */ + __u8 bar; /* Where to find it. */ + __u8 padding[3]; /* Pad to full dword. */ __le32 offset; /* Offset within bar. */ - __le32 length; /* Length. */ + __le32 length; /* Length of the structure, in bytes. */ }; #define VIRTIO_PCI_CAP_BAR_SHIFT 5 -- cgit v1.2.3 From 89461c4a12faa643fd7564037440d626b777b033 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 May 2013 16:29:32 +0930 Subject: virtio_pci: macros for PCI layout offsets QEMU wants it, so why not? Trust, but verify. Signed-off-by: Rusty Russell Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 60 +++++++++++++++++++++++++++++++++++++- include/uapi/linux/virtio_pci.h | 36 +++++++++++++++++++---- 2 files changed, 90 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index a3d81013e0c2..b2e707ad81cf 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -464,9 +464,67 @@ static void virtio_pci_release_dev(struct device *_d) kfree(vp_dev); } -/* TODO: validate the ABI statically. */ +/* This is part of the ABI. Don't screw with it. */ static inline void check_offsets(void) { + /* Note: disk space was harmed in compilation of this function. */ + BUILD_BUG_ON(VIRTIO_PCI_CAP_VNDR != + offsetof(struct virtio_pci_cap, cap_vndr)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_NEXT != + offsetof(struct virtio_pci_cap, cap_next)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_LEN != + offsetof(struct virtio_pci_cap, cap_len)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_CFG_TYPE != + offsetof(struct virtio_pci_cap, cfg_type)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_BAR != + offsetof(struct virtio_pci_cap, bar)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_OFFSET != + offsetof(struct virtio_pci_cap, offset)); + BUILD_BUG_ON(VIRTIO_PCI_CAP_LENGTH != + offsetof(struct virtio_pci_cap, length)); + BUILD_BUG_ON(VIRTIO_PCI_NOTIFY_CAP_MULT != + offsetof(struct virtio_pci_notify_cap, + notify_off_multiplier)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_DFSELECT != + offsetof(struct virtio_pci_common_cfg, + device_feature_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_DF != + offsetof(struct virtio_pci_common_cfg, device_feature)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_GFSELECT != + offsetof(struct virtio_pci_common_cfg, + guest_feature_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_GF != + offsetof(struct virtio_pci_common_cfg, guest_feature)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_MSIX != + offsetof(struct virtio_pci_common_cfg, msix_config)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_NUMQ != + offsetof(struct virtio_pci_common_cfg, num_queues)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_STATUS != + offsetof(struct virtio_pci_common_cfg, device_status)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_CFGGENERATION != + offsetof(struct virtio_pci_common_cfg, config_generation)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SELECT != + offsetof(struct virtio_pci_common_cfg, queue_select)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_SIZE != + offsetof(struct virtio_pci_common_cfg, queue_size)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_MSIX != + offsetof(struct virtio_pci_common_cfg, queue_msix_vector)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_ENABLE != + offsetof(struct virtio_pci_common_cfg, queue_enable)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_NOFF != + offsetof(struct virtio_pci_common_cfg, queue_notify_off)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCLO != + offsetof(struct virtio_pci_common_cfg, queue_desc_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_DESCHI != + offsetof(struct virtio_pci_common_cfg, queue_desc_hi)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILLO != + offsetof(struct virtio_pci_common_cfg, queue_avail_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_AVAILHI != + offsetof(struct virtio_pci_common_cfg, queue_avail_hi)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDLO != + offsetof(struct virtio_pci_common_cfg, queue_used_lo)); + BUILD_BUG_ON(VIRTIO_PCI_COMMON_Q_USEDHI != + offsetof(struct virtio_pci_common_cfg, queue_used_hi)); } /* the PCI probing function */ diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index a2b2e1353e30..3b7e4d2765fb 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -124,11 +124,6 @@ struct virtio_pci_cap { __le32 length; /* Length of the structure, in bytes. */ }; -#define VIRTIO_PCI_CAP_BAR_SHIFT 5 -#define VIRTIO_PCI_CAP_BAR_MASK 0x7 -#define VIRTIO_PCI_CAP_TYPE_SHIFT 0 -#define VIRTIO_PCI_CAP_TYPE_MASK 0x7 - struct virtio_pci_notify_cap { struct virtio_pci_cap cap; __le32 notify_off_multiplier; /* Multiplier for queue_notify_off. */ @@ -160,6 +155,37 @@ struct virtio_pci_common_cfg { __le32 queue_used_hi; /* read-write */ }; +/* Macro versions of offsets for the Old Timers! */ +#define VIRTIO_PCI_CAP_VNDR 0 +#define VIRTIO_PCI_CAP_NEXT 1 +#define VIRTIO_PCI_CAP_LEN 2 +#define VIRTIO_PCI_CAP_CFG_TYPE 3 +#define VIRTIO_PCI_CAP_BAR 4 +#define VIRTIO_PCI_CAP_OFFSET 8 +#define VIRTIO_PCI_CAP_LENGTH 12 + +#define VIRTIO_PCI_NOTIFY_CAP_MULT 16 + +#define VIRTIO_PCI_COMMON_DFSELECT 0 +#define VIRTIO_PCI_COMMON_DF 4 +#define VIRTIO_PCI_COMMON_GFSELECT 8 +#define VIRTIO_PCI_COMMON_GF 12 +#define VIRTIO_PCI_COMMON_MSIX 16 +#define VIRTIO_PCI_COMMON_NUMQ 18 +#define VIRTIO_PCI_COMMON_STATUS 20 +#define VIRTIO_PCI_COMMON_CFGGENERATION 21 +#define VIRTIO_PCI_COMMON_Q_SELECT 22 +#define VIRTIO_PCI_COMMON_Q_SIZE 24 +#define VIRTIO_PCI_COMMON_Q_MSIX 26 +#define VIRTIO_PCI_COMMON_Q_ENABLE 28 +#define VIRTIO_PCI_COMMON_Q_NOFF 30 +#define VIRTIO_PCI_COMMON_Q_DESCLO 32 +#define VIRTIO_PCI_COMMON_Q_DESCHI 36 +#define VIRTIO_PCI_COMMON_Q_AVAILLO 40 +#define VIRTIO_PCI_COMMON_Q_AVAILHI 44 +#define VIRTIO_PCI_COMMON_Q_USEDLO 48 +#define VIRTIO_PCI_COMMON_Q_USEDHI 52 + #endif /* VIRTIO_PCI_NO_MODERN */ #endif -- cgit v1.2.3 From 25e65e4efca4116a9fc7a892ede2cf98f138de45 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 15 Jan 2015 13:33:31 +0200 Subject: virtio_balloon: coding style fixes Most of our code has struct foo { } Fix two instances where balloon is inconsistent. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_balloon.c | 3 +-- include/uapi/linux/virtio_balloon.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 3176ea4028a8..0413157f3b49 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -44,8 +44,7 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); -struct virtio_balloon -{ +struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index be40f7059e93..4b0488f20b2e 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -36,8 +36,7 @@ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 -struct virtio_balloon_config -{ +struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ __le32 num_pages; /* Number of pages we've actually got in balloon. */ -- cgit v1.2.3 From eb710b152003ea74561512b3f55c1e3c71dcef39 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Wed, 24 Dec 2014 14:52:04 +0900 Subject: Btrfs: Remove unnecessary placeholder in btrfs_err_code "notused" is not necessary. Set 1 to the first entry is enough. Signed-off-by: Takeuchi Satoru Reviewed-by: David Sterba Signed-off-by: Chris Mason --- include/uapi/linux/btrfs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 611e1c5893b4..b6dec05c7196 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -495,8 +495,7 @@ struct btrfs_ioctl_send_args { /* Error codes as returned by the kernel */ enum btrfs_err_code { - notused, - BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, -- cgit v1.2.3 From 6140a8f5623820cec7f56c63444b9551d8d35775 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 15:05:08 -0700 Subject: vfio-pci: Add device request interface Userspace can opt to receive a device request notification, indicating that the device should be released. This is setup the same way as the error IRQ and also supports eventfd signaling. Future support may forcefully remove the device from the user if the request is ignored. Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 21 ++++++++++++++++++++- drivers/vfio/pci/vfio_pci_intrs.c | 16 ++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h | 1 + include/uapi/linux/vfio.h | 1 + 4 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 7cc0122a18ce..f8a186381ae8 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -239,9 +239,12 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { if (pci_is_pcie(vdev->pdev)) return 1; + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { + return 1; + } return 0; } @@ -464,6 +467,7 @@ static long vfio_pci_ioctl(void *device_data, switch (info.index) { case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: break; case VFIO_PCI_ERR_IRQ_INDEX: if (pci_is_pcie(vdev->pdev)) @@ -828,6 +832,20 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) req_len, vma->vm_page_prot); } +static void vfio_pci_request(void *device_data, unsigned int count) +{ + struct vfio_pci_device *vdev = device_data; + + mutex_lock(&vdev->igate); + + if (vdev->req_trigger) { + dev_dbg(&vdev->pdev->dev, "Requesting device from user\n"); + eventfd_signal(vdev->req_trigger, 1); + } + + mutex_unlock(&vdev->igate); +} + static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", .open = vfio_pci_open, @@ -836,6 +854,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .read = vfio_pci_read, .write = vfio_pci_write, .mmap = vfio_pci_mmap, + .request = vfio_pci_request, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index b134befbfd0e..f88bfdf5b6a0 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -817,6 +817,16 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); } +static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1) + return -EINVAL; + + return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data); +} + int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) @@ -858,6 +868,12 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, func = vfio_pci_set_err_trigger; break; } + case VFIO_PCI_REQ_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_pci_set_req_trigger; + break; + } } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 671c17a6e6d0..c9f9b323f152 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -58,6 +58,7 @@ struct vfio_pci_device { struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; + struct eventfd_ctx *req_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 29715d27548f..82889c30f4f5 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -333,6 +333,7 @@ enum { VFIO_PCI_MSI_IRQ_INDEX, VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, + VFIO_PCI_REQ_IRQ_INDEX, VFIO_PCI_NUM_IRQS }; -- cgit v1.2.3 From e6a02746e0a9cdda5114db912fe2aadfed289aae Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:01:14 +1030 Subject: virtio: define VIRTIO_PCI_CAP_PCI_CFG in header. This provides backdoor access to the device MMIOs, and every device should have one. From the virtio 1.0 spec (CS03): 4.1.4.7.1 Device Requirements: PCI configuration access capability The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG capability. Signed-off-by: Rusty Russell Acked-by: Michael S. Tsirkin --- include/uapi/linux/virtio_pci.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 3b7e4d2765fb..75301468359f 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -109,8 +109,10 @@ #define VIRTIO_PCI_CAP_NOTIFY_CFG 2 /* ISR access */ #define VIRTIO_PCI_CAP_ISR_CFG 3 -/* Device specific confiuration */ +/* Device specific configuration */ #define VIRTIO_PCI_CAP_DEVICE_CFG 4 +/* PCI configuration access */ +#define VIRTIO_PCI_CAP_PCI_CFG 5 /* This is the PCI capability header: */ struct virtio_pci_cap { -- cgit v1.2.3 From 527100a4ee744bbfc90f1609ee4a0144883b3e4a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:01:14 +1030 Subject: virtio: Don't expose legacy block features when VIRTIO_BLK_NO_LEGACY defined. This allows modern implementations to ensure they don't use legacy feature bits or SCSI commands (which are not used in v1.0 non-legacy). Signed-off-by: Rusty Russell Acked-by: Michael S. Tsirkin --- include/uapi/linux/virtio_blk.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 247c8ba8544a..3c53eec4ae22 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -31,22 +31,25 @@ #include /* Feature bits */ -#define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */ #define VIRTIO_BLK_F_SIZE_MAX 1 /* Indicates maximum segment size */ #define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */ #define VIRTIO_BLK_F_GEOMETRY 4 /* Legacy geometry available */ #define VIRTIO_BLK_F_RO 5 /* Disk is read-only */ #define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/ -#define VIRTIO_BLK_F_SCSI 7 /* Supports scsi command passthru */ -#define VIRTIO_BLK_F_WCE 9 /* Writeback mode enabled after reset */ #define VIRTIO_BLK_F_TOPOLOGY 10 /* Topology information is available */ -#define VIRTIO_BLK_F_CONFIG_WCE 11 /* Writeback mode available in config */ #define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ +/* Legacy feature bits */ +#ifndef VIRTIO_BLK_NO_LEGACY +#define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */ +#define VIRTIO_BLK_F_SCSI 7 /* Supports scsi command passthru */ +#define VIRTIO_BLK_F_WCE 9 /* Writeback mode enabled after reset */ +#define VIRTIO_BLK_F_CONFIG_WCE 11 /* Writeback mode available in config */ #ifndef __KERNEL__ /* Old (deprecated) name for VIRTIO_BLK_F_WCE. */ #define VIRTIO_BLK_F_FLUSH VIRTIO_BLK_F_WCE #endif +#endif /* !VIRTIO_BLK_NO_LEGACY */ #define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ @@ -100,8 +103,10 @@ struct virtio_blk_config { #define VIRTIO_BLK_T_IN 0 #define VIRTIO_BLK_T_OUT 1 +#ifndef VIRTIO_BLK_NO_LEGACY /* This bit says it's a scsi command, not an actual read or write. */ #define VIRTIO_BLK_T_SCSI_CMD 2 +#endif /* VIRTIO_BLK_NO_LEGACY */ /* Cache flush command */ #define VIRTIO_BLK_T_FLUSH 4 @@ -109,8 +114,10 @@ struct virtio_blk_config { /* Get device ID command */ #define VIRTIO_BLK_T_GET_ID 8 +#ifndef VIRTIO_BLK_NO_LEGACY /* Barrier before this op. */ #define VIRTIO_BLK_T_BARRIER 0x80000000 +#endif /* !VIRTIO_BLK_NO_LEGACY */ /* This is the first element of the read scatter-gather list. */ struct virtio_blk_outhdr { @@ -122,12 +129,14 @@ struct virtio_blk_outhdr { __virtio64 sector; }; +#ifndef VIRTIO_BLK_NO_LEGACY struct virtio_scsi_inhdr { __virtio32 errors; __virtio32 data_len; __virtio32 sense_len; __virtio32 residual; }; +#endif /* !VIRTIO_BLK_NO_LEGACY */ /* And this is the final byte of the write scatter-gather list. */ #define VIRTIO_BLK_S_OK 0 -- cgit v1.2.3 From 6d96ee98b1d08bcf0f90a6bf2c6766dda6b3a010 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:01:14 +1030 Subject: virtio: Don't expose legacy config features when VIRTIO_CONFIG_NO_LEGACY defined. The VIRTIO_F_ANY_LAYOUT and VIRTIO_F_NOTIFY_ON_EMPTY features are pre-1.0 only. Signed-off-by: Rusty Russell Acked-by: Michael S. Tsirkin --- include/uapi/linux/virtio_config.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index a6d0cdeaacd4..c18264df9504 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -49,12 +49,14 @@ #define VIRTIO_TRANSPORT_F_START 28 #define VIRTIO_TRANSPORT_F_END 33 +#ifndef VIRTIO_CONFIG_NO_LEGACY /* Do we get callbacks when the ring is completely used, even if we've * suppressed them? */ #define VIRTIO_F_NOTIFY_ON_EMPTY 24 /* Can the device handle any descriptor layout? */ #define VIRTIO_F_ANY_LAYOUT 27 +#endif /* VIRTIO_CONFIG_NO_LEGACY */ /* v1.0 compliant. */ #define VIRTIO_F_VERSION_1 32 -- cgit v1.2.3 From 9791554b45a2acc28247f66a5fd5bbc212a6b8c8 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 8 Jan 2015 12:17:37 +0000 Subject: MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS Userland code may be built using an ABI which permits linking to objects that have more restrictive floating point requirements. For example, userland code may be built to target the O32 FPXX ABI. Such code may be linked with other FPXX code, or code built for either one of the more restrictive FP32 or FP64. When linking with more restrictive code, the overall requirement of the process becomes that of the more restrictive code. The kernel has no way to know in advance which mode the process will need to be executed in, and indeed it may need to change during execution. The dynamic loader is the only code which will know the overall required mode, and so it needs to have a means to instruct the kernel to switch the FP mode of the process. This patch introduces 2 new options to the prctl syscall which provide such a capability. The FP mode of the process is represented as a simple bitmask combining a number of mode bits mirroring those present in the hardware. Userland can either retrieve the current FP mode of the process: mode = prctl(PR_GET_FP_MODE); or modify the current FP mode of the process: err = prctl(PR_SET_FP_MODE, new_mode); Signed-off-by: Paul Burton Cc: Matthew Fortune Cc: Markos Chandras Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/8899/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mmu.h | 3 ++ arch/mips/include/asm/mmu_context.h | 2 + arch/mips/include/asm/processor.h | 11 +++++ arch/mips/kernel/process.c | 92 +++++++++++++++++++++++++++++++++++++ arch/mips/kernel/traps.c | 19 ++++++++ include/uapi/linux/prctl.h | 5 ++ kernel/sys.c | 12 +++++ 7 files changed, 144 insertions(+) (limited to 'include/uapi') diff --git a/arch/mips/include/asm/mmu.h b/arch/mips/include/asm/mmu.h index c436138945a8..1afa1f986df8 100644 --- a/arch/mips/include/asm/mmu.h +++ b/arch/mips/include/asm/mmu.h @@ -1,9 +1,12 @@ #ifndef __ASM_MMU_H #define __ASM_MMU_H +#include + typedef struct { unsigned long asid[NR_CPUS]; void *vdso; + atomic_t fp_mode_switching; } mm_context_t; #endif /* __ASM_MMU_H */ diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h index 2f82568a3ee4..87f11072f557 100644 --- a/arch/mips/include/asm/mmu_context.h +++ b/arch/mips/include/asm/mmu_context.h @@ -132,6 +132,8 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) for_each_possible_cpu(i) cpu_context(i, mm) = 0; + atomic_set(&mm->context.fp_mode_switching, 0); + return 0; } diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index f1df4cb4a286..9daa38608cd8 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -399,4 +399,15 @@ unsigned long get_wchan(struct task_struct *p); #endif +/* + * Functions & macros implementing the PR_GET_FP_MODE & PR_SET_FP_MODE options + * to the prctl syscall. + */ +extern int mips_get_process_fp_mode(struct task_struct *task); +extern int mips_set_process_fp_mode(struct task_struct *task, + unsigned int value); + +#define GET_FP_MODE(task) mips_get_process_fp_mode(task) +#define SET_FP_MODE(task,value) mips_set_process_fp_mode(task, value) + #endif /* _ASM_PROCESSOR_H */ diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index eb76434828e8..4677b4c67da6 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -550,3 +551,94 @@ void arch_trigger_all_cpu_backtrace(bool include_self) { smp_call_function(arch_dump_stack, NULL, 1); } + +int mips_get_process_fp_mode(struct task_struct *task) +{ + int value = 0; + + if (!test_tsk_thread_flag(task, TIF_32BIT_FPREGS)) + value |= PR_FP_MODE_FR; + if (test_tsk_thread_flag(task, TIF_HYBRID_FPREGS)) + value |= PR_FP_MODE_FRE; + + return value; +} + +int mips_set_process_fp_mode(struct task_struct *task, unsigned int value) +{ + const unsigned int known_bits = PR_FP_MODE_FR | PR_FP_MODE_FRE; + unsigned long switch_count; + struct task_struct *t; + + /* Check the value is valid */ + if (value & ~known_bits) + return -EOPNOTSUPP; + + /* Avoid inadvertently triggering emulation */ + if ((value & PR_FP_MODE_FR) && cpu_has_fpu && + !(current_cpu_data.fpu_id & MIPS_FPIR_F64)) + return -EOPNOTSUPP; + if ((value & PR_FP_MODE_FRE) && cpu_has_fpu && !cpu_has_fre) + return -EOPNOTSUPP; + + /* Save FP & vector context, then disable FPU & MSA */ + if (task->signal == current->signal) + lose_fpu(1); + + /* Prevent any threads from obtaining live FP context */ + atomic_set(&task->mm->context.fp_mode_switching, 1); + smp_mb__after_atomic(); + + /* + * If there are multiple online CPUs then wait until all threads whose + * FP mode is about to change have been context switched. This approach + * allows us to only worry about whether an FP mode switch is in + * progress when FP is first used in a tasks time slice. Pretty much all + * of the mode switch overhead can thus be confined to cases where mode + * switches are actually occuring. That is, to here. However for the + * thread performing the mode switch it may take a while... + */ + if (num_online_cpus() > 1) { + spin_lock_irq(&task->sighand->siglock); + + for_each_thread(task, t) { + if (t == current) + continue; + + switch_count = t->nvcsw + t->nivcsw; + + do { + spin_unlock_irq(&task->sighand->siglock); + cond_resched(); + spin_lock_irq(&task->sighand->siglock); + } while ((t->nvcsw + t->nivcsw) == switch_count); + } + + spin_unlock_irq(&task->sighand->siglock); + } + + /* + * There are now no threads of the process with live FP context, so it + * is safe to proceed with the FP mode switch. + */ + for_each_thread(task, t) { + /* Update desired FP register width */ + if (value & PR_FP_MODE_FR) { + clear_tsk_thread_flag(t, TIF_32BIT_FPREGS); + } else { + set_tsk_thread_flag(t, TIF_32BIT_FPREGS); + clear_tsk_thread_flag(t, TIF_MSA_CTX_LIVE); + } + + /* Update desired FP single layout */ + if (value & PR_FP_MODE_FRE) + set_tsk_thread_flag(t, TIF_HYBRID_FPREGS); + else + clear_tsk_thread_flag(t, TIF_HYBRID_FPREGS); + } + + /* Allow threads to use FP again */ + atomic_set(&task->mm->context.fp_mode_switching, 0); + + return 0; +} diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index ad3d2031c327..d5fbfb51b9da 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1134,10 +1134,29 @@ static int default_cu2_call(struct notifier_block *nfb, unsigned long action, return NOTIFY_OK; } +static int wait_on_fp_mode_switch(atomic_t *p) +{ + /* + * The FP mode for this task is currently being switched. That may + * involve modifications to the format of this tasks FP context which + * make it unsafe to proceed with execution for the moment. Instead, + * schedule some other task. + */ + schedule(); + return 0; +} + static int enable_restore_fp_context(int msa) { int err, was_fpu_owner, prior_msa; + /* + * If an FP mode switch is currently underway, wait for it to + * complete before proceeding. + */ + wait_on_atomic_t(¤t->mm->context.fp_mode_switching, + wait_on_fp_mode_switch, TASK_KILLABLE); + if (!used_math()) { /* First time FP context user. */ preempt_disable(); diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 89f63503f903..31891d9535e2 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -185,4 +185,9 @@ struct prctl_mm_map { #define PR_MPX_ENABLE_MANAGEMENT 43 #define PR_MPX_DISABLE_MANAGEMENT 44 +#define PR_SET_FP_MODE 45 +#define PR_GET_FP_MODE 46 +# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ +# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index a8c9f5a7dda6..08b16bbdcdf0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -97,6 +97,12 @@ #ifndef MPX_DISABLE_MANAGEMENT # define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) #endif +#ifndef GET_FP_MODE +# define GET_FP_MODE(a) (-EINVAL) +#endif +#ifndef SET_FP_MODE +# define SET_FP_MODE(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2215,6 +2221,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_MPX_DISABLE_MANAGEMENT: error = MPX_DISABLE_MANAGEMENT(me); break; + case PR_SET_FP_MODE: + error = SET_FP_MODE(me, arg2); + break; + case PR_GET_FP_MODE: + error = GET_FP_MODE(me); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From ed9ecb0415b97b5f9f91f146e1977bb372c74c6d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Feb 2015 17:13:44 +1030 Subject: virtio: Don't expose legacy net features when VIRTIO_NET_NO_LEGACY defined. In particular, the virtio header always has the u16 num_buffers field. We define a new 'struct virtio_net_hdr_v1' for this (rather than simply calling it 'struct virtio_net_hdr', to avoid nasty type errors if some parts of a project define VIRTIO_NET_NO_LEGACY and some don't. Transitional devices (which can't define VIRTIO_NET_NO_LEGACY) will have to keep using struct virtio_net_hdr_mrg_rxbuf, which has the same byte layout as struct virtio_net_hdr_v1. Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_net.h | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index b5f1677b291c..4a9b58113d6e 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -35,7 +35,6 @@ #define VIRTIO_NET_F_CSUM 0 /* Host handles pkts w/ partial csum */ #define VIRTIO_NET_F_GUEST_CSUM 1 /* Guest handles pkts w/ partial csum */ #define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */ -#define VIRTIO_NET_F_GSO 6 /* Host handles pkts w/ any GSO type */ #define VIRTIO_NET_F_GUEST_TSO4 7 /* Guest can handle TSOv4 in. */ #define VIRTIO_NET_F_GUEST_TSO6 8 /* Guest can handle TSOv6 in. */ #define VIRTIO_NET_F_GUEST_ECN 9 /* Guest can handle TSO[6] w/ ECN in. */ @@ -56,6 +55,10 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#ifndef VIRTIO_NET_NO_LEGACY +#define VIRTIO_NET_F_GSO 6 /* Host handles pkts w/ any GSO type */ +#endif /* VIRTIO_NET_NO_LEGACY */ + #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ #define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */ @@ -71,8 +74,9 @@ struct virtio_net_config { __u16 max_virtqueue_pairs; } __attribute__((packed)); +#ifndef VIRTIO_NET_NO_LEGACY /* This header comes first in the scatter-gather list. - * If VIRTIO_F_ANY_LAYOUT is not negotiated, it must + * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must * be the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ struct virtio_net_hdr { @@ -97,6 +101,30 @@ struct virtio_net_hdr_mrg_rxbuf { struct virtio_net_hdr hdr; __virtio16 num_buffers; /* Number of merged rx buffers */ }; +#else /* ... VIRTIO_NET_NO_LEGACY */ +/* + * This header comes first in the scatter-gather list. If you don't + * specify GSO or CSUM features, you can simply ignore the header. + * + * This is bitwise-equivalent to the legacy struct virtio_net_hdr_mrg_rxbuf. + */ +struct virtio_net_hdr_v1 { +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ + __u8 flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ +#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ +#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ +#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ + __u8 gso_type; + __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ + __virtio16 gso_size; /* Bytes to append to hdr_len per frame */ + __virtio16 csum_start; /* Position to start checksumming from */ + __virtio16 csum_offset; /* Offset after that to place checksum */ + __virtio16 num_buffers; /* Number of merged rx buffers */ +}; +#endif /* ...VIRTIO_NET_NO_LEGACY */ /* * Control virtqueue data structures -- cgit v1.2.3 From e68c48f97547979c91de04b487d79dc0d3be7015 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 17 Feb 2015 16:12:43 +1030 Subject: virtio_net: unconditionally define struct virtio_net_hdr_v1. This was introduced in commit ed9ecb0415b97b5f9f91f146e1977bb372c74c6d, but only defined if !VIRTIO_NET_NO_LEGACY. We should always define it: easier for users to have conditional legacy code. Suggested-by: "Michael S. Tsirkin" Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_net.h | 54 +++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 4a9b58113d6e..7bbee79ca293 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -74,39 +74,12 @@ struct virtio_net_config { __u16 max_virtqueue_pairs; } __attribute__((packed)); -#ifndef VIRTIO_NET_NO_LEGACY -/* This header comes first in the scatter-gather list. - * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must - * be the first element of the scatter-gather list. If you don't - * specify GSO or CSUM features, you can simply ignore the header. */ -struct virtio_net_hdr { -#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset -#define VIRTIO_NET_HDR_F_DATA_VALID 2 // Csum is valid - __u8 flags; -#define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame -#define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO) -#define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO) -#define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP -#define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set - __u8 gso_type; - __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ - __virtio16 gso_size; /* Bytes to append to hdr_len per frame */ - __virtio16 csum_start; /* Position to start checksumming from */ - __virtio16 csum_offset; /* Offset after that to place checksum */ -}; - -/* This is the version of the header to use when the MRG_RXBUF - * feature has been negotiated. */ -struct virtio_net_hdr_mrg_rxbuf { - struct virtio_net_hdr hdr; - __virtio16 num_buffers; /* Number of merged rx buffers */ -}; -#else /* ... VIRTIO_NET_NO_LEGACY */ /* * This header comes first in the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. * - * This is bitwise-equivalent to the legacy struct virtio_net_hdr_mrg_rxbuf. + * This is bitwise-equivalent to the legacy struct virtio_net_hdr_mrg_rxbuf, + * only flattened. */ struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ @@ -124,6 +97,29 @@ struct virtio_net_hdr_v1 { __virtio16 csum_offset; /* Offset after that to place checksum */ __virtio16 num_buffers; /* Number of merged rx buffers */ }; + +#ifndef VIRTIO_NET_NO_LEGACY +/* This header comes first in the scatter-gather list. + * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must + * be the first element of the scatter-gather list. If you don't + * specify GSO or CSUM features, you can simply ignore the header. */ +struct virtio_net_hdr { + /* See VIRTIO_NET_HDR_F_* */ + __u8 flags; + /* See VIRTIO_NET_HDR_GSO_* */ + __u8 gso_type; + __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ + __virtio16 gso_size; /* Bytes to append to hdr_len per frame */ + __virtio16 csum_start; /* Position to start checksumming from */ + __virtio16 csum_offset; /* Offset after that to place checksum */ +}; + +/* This is the version of the header to use when the MRG_RXBUF + * feature has been negotiated. */ +struct virtio_net_hdr_mrg_rxbuf { + struct virtio_net_hdr hdr; + __virtio16 num_buffers; /* Number of merged rx buffers */ +}; #endif /* ...VIRTIO_NET_NO_LEGACY */ /* -- cgit v1.2.3 From 02d1aa7af17ef0e0655745ce32cab369ed040a67 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 8 Feb 2015 13:28:50 +0200 Subject: IB/core: Add support for extended query device caps Add extensible query device capabilities verb to allow adding new features. ib_uverbs_ex_query_device is added and copy_query_dev_fields is used to copy capability fields to be used by both ib_uverbs_query_device and ib_uverbs_ex_query_device. Following the discussion about this patch [1], the code now validates the command's comp_mask is zero, returning -EINVAL for unknown values, in order to allow extending the verb in the future. The verb also checks the user-space provided response buffer size and only fills in capabilities that will fit in the buffer. In attempt to follow the spirit of presentation [2] by Tzahi Oved that was presented during OpenFabrics Alliance International Developer Workshop 2013, the comp_mask bits will only describe which fields are valid. Furthermore, fields that can simply be cleared when they are not supported, do not require a comp_mask bit at all. The verb returns a response_length field containing the actual number of bytes written by the kernel, so that a newer version running on an older kernel can tell which fields were actually returned. [1] [PATCH v1 0/5] IB/core: extended query device caps cleanup for v3.19 http://thread.gmane.org/gmane.linux.kernel.api/7889/ [2] https://www.openfabrics.org/images/docs/2013_Dev_Workshop/Tues_0423/2013_Workshop_Tues_0830_Tzahi_Oved-verbs_extensions_ofa_2013-tzahio.pdf Signed-off-by: Eli Cohen Signed-off-by: Haggai Eran Reviewed-by: Yann Droneaud Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 131 +++++++++++++++++++++++----------- drivers/infiniband/core/uverbs_main.c | 1 + include/uapi/rdma/ib_user_verbs.h | 12 ++++ 4 files changed, 104 insertions(+), 41 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 643c08a025a5..b716b0815644 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); +IB_UVERBS_DECLARE_EX_CMD(query_device); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index b7943ff16ed3..8f507538c42b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -400,6 +400,52 @@ err: return ret; } +static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) +{ + resp->fw_ver = attr->fw_ver; + resp->node_guid = file->device->ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = attr->device_cap_flags; + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; +} + ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -420,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, return ret; memset(&resp, 0, sizeof resp); - - resp.fw_ver = attr.fw_ver; - resp.node_guid = file->device->ib_dev->node_guid; - resp.sys_image_guid = attr.sys_image_guid; - resp.max_mr_size = attr.max_mr_size; - resp.page_size_cap = attr.page_size_cap; - resp.vendor_id = attr.vendor_id; - resp.vendor_part_id = attr.vendor_part_id; - resp.hw_ver = attr.hw_ver; - resp.max_qp = attr.max_qp; - resp.max_qp_wr = attr.max_qp_wr; - resp.device_cap_flags = attr.device_cap_flags; - resp.max_sge = attr.max_sge; - resp.max_sge_rd = attr.max_sge_rd; - resp.max_cq = attr.max_cq; - resp.max_cqe = attr.max_cqe; - resp.max_mr = attr.max_mr; - resp.max_pd = attr.max_pd; - resp.max_qp_rd_atom = attr.max_qp_rd_atom; - resp.max_ee_rd_atom = attr.max_ee_rd_atom; - resp.max_res_rd_atom = attr.max_res_rd_atom; - resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom; - resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom; - resp.atomic_cap = attr.atomic_cap; - resp.max_ee = attr.max_ee; - resp.max_rdd = attr.max_rdd; - resp.max_mw = attr.max_mw; - resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp; - resp.max_raw_ethy_qp = attr.max_raw_ethy_qp; - resp.max_mcast_grp = attr.max_mcast_grp; - resp.max_mcast_qp_attach = attr.max_mcast_qp_attach; - resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach; - resp.max_ah = attr.max_ah; - resp.max_fmr = attr.max_fmr; - resp.max_map_per_fmr = attr.max_map_per_fmr; - resp.max_srq = attr.max_srq; - resp.max_srq_wr = attr.max_srq_wr; - resp.max_srq_sge = attr.max_srq_sge; - resp.max_pkeys = attr.max_pkeys; - resp.local_ca_ack_delay = attr.local_ca_ack_delay; - resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt; + copy_query_dev_fields(file, &resp, &attr); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -3287,3 +3293,46 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, return ret ? ret : in_len; } + +int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_query_device_resp resp; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr; + struct ib_device *device; + int err; + + device = file->device->ib_dev; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + resp.response_length = sizeof(resp); + + if (ucore->outlen < resp.response_length) + return -ENOSPC; + + err = device->query_device(device, &attr); + if (err) + return err; + + copy_query_dev_fields(file, &resp.base, &attr); + resp.comp_mask = 0; + + err = ib_copy_to_udata(ucore, &resp, resp.response_length); + if (err) + return err; + + return 0; +} diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5db1a8cc388d..259dcc7779f5 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -123,6 +123,7 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, + [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device, }; static void ib_uverbs_add_one(struct ib_device *device); diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 867cc5084afb..f0f799afd856 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -90,6 +90,7 @@ enum { }; enum { + IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, IB_USER_VERBS_EX_CMD_DESTROY_FLOW, }; @@ -201,6 +202,17 @@ struct ib_uverbs_query_device_resp { __u8 reserved[4]; }; +struct ib_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +struct ib_uverbs_ex_query_device_resp { + struct ib_uverbs_query_device_resp base; + __u32 comp_mask; + __u32 response_length; +}; + struct ib_uverbs_query_port { __u64 response; __u8 port_num; -- cgit v1.2.3 From f4056bfd8ccff4475417078d6e5456dfa1962dd3 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Sun, 8 Feb 2015 13:28:51 +0200 Subject: IB/core: Add on demand paging caps to ib_uverbs_ex_query_device Add on-demand paging capabilities reporting to the extended query device verb. Yann Droneaud writes: Note: as offsetof() is used to retrieve the size of the lower chunk of the response, beware that it only works if the upper chunk is right after, without any implicit padding. And, as the size of the latter chunk is added to the base size, implicit padding at the end of the structure is not taken in account. Both point must be taken in account when extending the uverbs functionalities. Signed-off-by: Haggai Eran Reviewed-by: Yann Droneaud Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 20 +++++++++++++++++++- include/uapi/rdma/ib_user_verbs.h | 11 +++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 8f507538c42b..04ca04559ce5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3318,7 +3318,7 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, if (cmd.reserved) return -EINVAL; - resp.response_length = sizeof(resp); + resp.response_length = offsetof(typeof(resp), odp_caps); if (ucore->outlen < resp.response_length) return -ENOSPC; @@ -3330,6 +3330,24 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, copy_query_dev_fields(file, &resp.base, &attr); resp.comp_mask = 0; + if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) + goto end; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + resp.odp_caps.general_caps = attr.odp_caps.general_caps; + resp.odp_caps.per_transport_caps.rc_odp_caps = + attr.odp_caps.per_transport_caps.rc_odp_caps; + resp.odp_caps.per_transport_caps.uc_odp_caps = + attr.odp_caps.per_transport_caps.uc_odp_caps; + resp.odp_caps.per_transport_caps.ud_odp_caps = + attr.odp_caps.per_transport_caps.ud_odp_caps; + resp.odp_caps.reserved = 0; +#else + memset(&resp.odp_caps, 0, sizeof(resp.odp_caps)); +#endif + resp.response_length += sizeof(resp.odp_caps); + +end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); if (err) return err; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index f0f799afd856..b513e662d8e4 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -207,10 +207,21 @@ struct ib_uverbs_ex_query_device { __u32 reserved; }; +struct ib_uverbs_odp_caps { + __u64 general_caps; + struct { + __u32 rc_odp_caps; + __u32 uc_odp_caps; + __u32 ud_odp_caps; + } per_transport_caps; + __u32 reserved; +}; + struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_query_device_resp base; __u32 comp_mask; __u32 response_length; + struct ib_uverbs_odp_caps odp_caps; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From e1e5e5641e6f271321aec257ed26a72715e4a8c2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 13:39:03 -0700 Subject: NVMe: Metadata format support Adds support for NVMe metadata formats and exposes block devices for all namespaces regardless of their format. Namespace formats that are unusable will have disk capacity set to 0, but a handle to the block device is created to simplify device management. A namespace is not usable when the format requires host interleave block and metadata in single buffer, has no provisioned storage, or has better data but failed to register with blk integrity. The namespace has to be scanned in two phases to support separate metadata formats. The first establishes the sector size and capacity prior to invoking add_disk. If metadata is required, the capacity will be temporarilly set to 0 until it can be revalidated and registered with the integrity extenstions after add_disk completes. The driver relies on the integrity extensions to provide the metadata buffer. NVMe requires this be a single physically contiguous region, so only one integrity segment is allowed per command. If the metadata is used for T10 PI, the driver provides mappings to save and restore the reftag physical block translation. The driver provides no-op functions for generate and verify if metadata is not used for protection information. This way the setup is always provided by the block layer. If a request does not supply a required metadata buffer, the command is failed with bad address. This could only happen if a user manually disables verify/generate on such a disk. The only exception to where this is okay is if the controller is capable of stripping/generating the metadata, which is possible on some types of formats. The metadata scatter gather list now occupies the spot in the nvme_iod that used to be used to link retryable IOD's, but we don't do that anymore, so the field was unused. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 298 +++++++++++++++++++++++++++++++++++----------- include/linux/nvme.h | 2 + include/uapi/linux/nvme.h | 16 +++ 3 files changed, 249 insertions(+), 67 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cbdfbbf98392..3ffa57a932ea 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -482,6 +483,62 @@ static int nvme_error_status(u16 status) } } +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == v) + pi->ref_tag = cpu_to_be32(p); +} + +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == p) + pi->ref_tag = cpu_to_be32(v); +} + +/** + * nvme_dif_remap - remaps ref tags to bip seed and physical lba + * + * The virtual start sector is the one that was originally submitted by the + * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical + * start sector may be different. Remap protection information to match the + * physical LBA on writes, and back to the original seed on reads. + * + * Type 0 and 3 do not have a ref tag, so no remapping required. + */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ + struct nvme_ns *ns = req->rq_disk->private_data; + struct bio_integrity_payload *bip; + struct t10_pi_tuple *pi; + void *p, *pmap; + u32 i, nlb, ts, phys, virt; + + if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) + return; + + bip = bio_integrity(req->bio); + if (!bip) + return; + + pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; + if (!pmap) + return; + + p = pmap; + virt = bip_get_seed(bip); + phys = nvme_block_nr(ns, blk_rq_pos(req)); + nlb = (blk_rq_bytes(req) >> ns->lba_shift); + ts = ns->disk->integrity->tuple_size; + + for (i = 0; i < nlb; i++, virt++, phys++) { + pi = (struct t10_pi_tuple *)p; + dif_swap(phys, virt, pi); + p += ts; + } + kunmap_atomic(pmap); +} + static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { @@ -512,9 +569,16 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, "completing aborted command with status:%04x\n", status); - if (iod->nents) + if (iod->nents) { dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + } + } nvme_free_iod(nvmeq->dev, iod); blk_mq_complete_request(req); @@ -670,6 +734,24 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (blk_integrity_rq(req)) { + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + } else if (ns->ms) + control |= NVME_RW_PRINFO_PRACT; + cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -690,6 +772,19 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_iod *iod; enum dma_data_direction dma_dir; + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. + */ + if (ns->ms && !blk_integrity_rq(req)) { + if (!(ns->pi_type && ns->ms == 8)) { + req->errors = -EFAULT; + blk_mq_complete_request(req); + return BLK_MQ_RQ_QUEUE_OK; + } + } + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; @@ -725,6 +820,21 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, iod->nents, dma_dir); goto retry_cmd; } + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) + goto error_cmd; + + sg_init_table(iod->meta_sg, 1); + if (blk_rq_map_integrity_sg( + req->q, req->bio, iod->meta_sg) != 1) + goto error_cmd; + + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); + + if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) + goto error_cmd; + } } nvme_set_info(cmd, iod, req_completion); @@ -1875,13 +1985,61 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_noop_verify(struct blk_integrity_iter *iter) +{ + return 0; +} + +static int nvme_noop_generate(struct blk_integrity_iter *iter) +{ + return 0; +} + +struct blk_integrity nvme_meta_noop = { + .name = "NVME_META_NOOP", + .generate_fn = nvme_noop_generate, + .verify_fn = nvme_noop_verify, +}; + +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity = t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity = t10_pi_type1_crc; + break; + default: + integrity = nvme_meta_noop; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} + static int nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id; dma_addr_t dma_addr; - int lbaf; + int lbaf, pi_type, old_ms; + unsigned short bs; id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, GFP_KERNEL); @@ -1890,16 +2048,50 @@ static int nvme_revalidate_disk(struct gendisk *disk) __func__); return 0; } + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) { + dev_warn(&dev->pci_dev->dev, + "identify failed ns:%d, setting capacity to 0\n", + ns->ns_id); + memset(id, 0, sizeof(*id)); + } - if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) - goto free; - - lbaf = id->flbas & 0xf; + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? + id->dps & NVME_NS_DPS_PI_MASK : 0; + + if (disk->integrity && (ns->pi_type != pi_type || ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && id->flbas & NVME_NS_FLBAS_META_EXT))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !disk->integrity && (disk->flags & GENHD_FL_UP) && + !(id->flbas & NVME_NS_FLBAS_META_EXT)) + nvme_init_integrity(ns); + + if (id->ncap == 0 || (ns->ms && !disk->integrity)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - free: dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); return 0; } @@ -1956,30 +2148,16 @@ static int nvme_kthread(void *data) return 0; } -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - ns->queue->limits.max_discard_sectors = 0xffffffff; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, - struct nvme_id_ns *id, struct nvme_lba_range_type *rt) +static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; int node = dev_to_node(&dev->pci_dev->dev); - int lbaf; - - if (rt->attributes & NVME_LBART_ATTRIB_HIDE) - return NULL; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return NULL; + return; + ns->queue = blk_mq_init_queue(&dev->tagset); if (IS_ERR(ns->queue)) goto out_free_ns; @@ -1995,9 +2173,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, ns->ns_id = nsid; ns->disk = disk; - lbaf = id->flbas & 0xf; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + list_add_tail(&ns->list, &dev->namespaces); + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); @@ -2014,18 +2192,23 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, disk->driverfs_dev = &dev->pci_dev->dev; disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - - return ns; + /* + * Initialize capacity to 0 until we establish the namespace format and + * setup integrity extentions if necessary. The revalidate_disk after + * add_disk allows the driver to register with integrity if the format + * requires it. + */ + set_capacity(disk, 0); + nvme_revalidate_disk(ns->disk); + add_disk(ns->disk); + if (ns->ms) + revalidate_disk(ns->disk); + return; out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); - return NULL; } static void nvme_create_io_queues(struct nvme_dev *dev) @@ -2150,22 +2333,20 @@ static int nvme_dev_add(struct nvme_dev *dev) struct pci_dev *pdev = dev->pci_dev; int res; unsigned nn, i; - struct nvme_ns *ns; struct nvme_id_ctrl *ctrl; - struct nvme_id_ns *id_ns; void *mem; dma_addr_t dma_addr; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL); + mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL); if (!mem) return -ENOMEM; res = nvme_identify(dev, 0, 1, dma_addr); if (res) { dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); - res = -EIO; - goto out; + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + return -EIO; } ctrl = mem; @@ -2191,6 +2372,7 @@ static int nvme_dev_add(struct nvme_dev *dev) } else dev->max_hw_sectors = max_hw_sectors; } + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; @@ -2203,33 +2385,12 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->tagset)) - goto out; - - id_ns = mem; - for (i = 1; i <= nn; i++) { - res = nvme_identify(dev, i, 0, dma_addr); - if (res) - continue; - - if (id_ns->ncap == 0) - continue; - - res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096, NULL); - if (res) - memset(mem + 4096, 0, 4096); + return 0; - ns = nvme_alloc_ns(dev, i, mem, mem + 4096); - if (ns) - list_add_tail(&ns->list, &dev->namespaces); - } - list_for_each_entry(ns, &dev->namespaces, list) - add_disk(ns->disk); - res = 0; + for (i = 1; i <= nn; i++) + nvme_alloc_ns(dev, i); - out: - dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); - return res; + return 0; } static int nvme_dev_map(struct nvme_dev *dev) @@ -2528,8 +2689,11 @@ static void nvme_dev_remove(struct nvme_dev *dev) struct nvme_ns *ns; list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->disk->flags & GENHD_FL_UP) + if (ns->disk->flags & GENHD_FL_UP) { + if (ns->disk->integrity) + blk_integrity_unregister(ns->disk); del_gendisk(ns->disk); + } if (!blk_queue_dying(ns->queue)) { blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 19a5d4b23209..cca264db2478 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -121,6 +121,7 @@ struct nvme_ns { unsigned ns_id; int lba_shift; int ms; + int pi_type; u64 mode_select_num_blocks; u32 mode_select_block_len; }; @@ -138,6 +139,7 @@ struct nvme_iod { int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; + struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ struct scatterlist sg[0]; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 26386cf3db44..406bfc95652c 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -124,10 +124,22 @@ struct nvme_id_ns { enum { NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, }; struct nvme_smart_log { @@ -261,6 +273,10 @@ enum { NVME_RW_DSM_LATENCY_LOW = 3 << 4, NVME_RW_DSM_SEQ_REQ = 1 << 6, NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, }; struct nvme_dsm_cmd { -- cgit v1.2.3 From 4f1982b4e262c45475a91b4253e9bc7f7c991c13 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 13:42:14 -0700 Subject: NVMe: Update SCSI Inquiry VPD 83h translation The original translation created collisions on Inquiry VPD 83 for many existing devices. Newer specifications provide other ways to translate based on the device's version can be used to create unique identifiers. Version 1.1 provides an EUI64 field that uniquely identifies each namespace, and 1.2 added the longer NGUID field for the same reason. Both follow the IEEE EUI format and readily translate to the SCSI device identification EUI designator type 2h. For devices implementing either, the translation will use this type, defaulting to the EUI64 8-byte type if implemented then NGUID's 16 byte version if not. If neither are provided, the 1.0 translation is used, and is updated to use the SCSI String format to guarantee a unique identifier. Knowing when to use the new fields depends on the nvme controller's revision. The NVME_VS macro was not decoding this correctly, so that is fixed in this patch and moved to a more appropriate place. Since the Identify Namespace structure required an update for the NGUID field, this patch adds the remaining new 1.2 fields to the structure. Signed-off-by: Keith Busch --- drivers/block/nvme-scsi.c | 94 ++++++++++++++++++++++++++--------------------- include/linux/nvme.h | 2 - include/uapi/linux/nvme.h | 10 ++++- 3 files changed, 62 insertions(+), 44 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 5e78568026c3..5cc907648742 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -779,10 +779,8 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_dev *dev = ns->dev; dma_addr_t dma_addr; void *mem; - struct nvme_id_ctrl *id_ctrl; int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; - u8 ieee[4]; int xfer_len; __be32 tmp_id = cpu_to_be32(ns->ns_id); @@ -793,46 +791,60 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } - /* nvme controller identify */ - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ctrl = mem; - - /* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */ - ieee[0] = id_ctrl->ieee[0] << 4; - ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4; - ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4; - ieee[3] = id_ctrl->ieee[2] >> 4; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + memset(inq_response, 0, alloc_len); inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - inq_response[3] = 20; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x03; /* PIV=0b | Asso=00b | Designator Type=3h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 16; /* Designator Length */ - /* Designator start */ - inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/ - inq_response[9] = ieee[2]; /* IEEE ID */ - inq_response[10] = ieee[1]; /* IEEE ID */ - inq_response[11] = ieee[0]; /* IEEE ID| Vendor Specific ID... */ - inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8; - inq_response[13] = (dev->pci_dev->vendor & 0x00FF); - inq_response[14] = dev->serial[0]; - inq_response[15] = dev->serial[1]; - inq_response[16] = dev->model[0]; - inq_response[17] = dev->model[1]; - memcpy(&inq_response[18], &tmp_id, sizeof(u32)); - /* Last 2 bytes are zero */ + if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { + struct nvme_id_ns *id_ns = mem; + void *eui = id_ns->eui64; + int len = sizeof(id_ns->eui64); - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + + if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { + if (bitmap_empty(eui, len * 8)) { + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); + } + } + if (bitmap_empty(eui, len * 8)) + goto scsi_string; + + inq_response[3] = 4 + len; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + } else { + scsi_string: + if (alloc_len < 72) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_free; + } + inq_response[3] = 0x48; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor); + memcpy(&inq_response[12], dev->model, sizeof(dev->model)); + sprintf(&inq_response[52], "%04x", tmp_id); + memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + } + xfer_len = alloc_len; res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); out_free: @@ -2222,7 +2234,7 @@ static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, page_code = GET_INQ_PAGE_CODE(cmd); alloc_len = GET_INQ_ALLOC_LENGTH(cmd); - inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL); + inq_response = kmalloc(alloc_len, GFP_KERNEL); if (inq_response == NULL) { res = -ENOMEM; goto out_mem; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index cca264db2478..1f062a9e521d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -62,8 +62,6 @@ enum { NVME_CSTS_SHST_MASK = 3 << 2, }; -#define NVME_VS(major, minor) (major << 16 | minor) - extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 406bfc95652c..aef9a81b2d75 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -115,7 +115,13 @@ struct nvme_id_ns { __le16 nawun; __le16 nawupf; __le16 nacwu; - __u8 rsvd40[80]; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __u16 rsvd46; + __le64 nvmcap[2]; + __u8 rsvd64[40]; + __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; __u8 rsvd192[192]; @@ -565,6 +571,8 @@ struct nvme_passthru_cmd { __u32 result; }; +#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8)) + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) -- cgit v1.2.3 From 30ff54765976e132674e3eae2071ed8ed494665c Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 23 Feb 2015 08:17:12 -0500 Subject: net: sched: export tc_connmark.h so it is uapi accessible Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index 19d5219b0b99..242cf0c6e33d 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -9,3 +9,4 @@ header-y += tc_pedit.h header-y += tc_skbedit.h header-y += tc_vlan.h header-y += tc_bpf.h +header-y += tc_connmark.h -- cgit v1.2.3