author    | Thomas Gleixner <tglx@linutronix.de> | 2010-02-21 20:17:22 +0100
committer | Thomas Gleixner <tglx@linutronix.de> | 2010-02-21 20:17:22 +0100
commit    | 5f854cfc024622e4aae14d7cf422f6ff86278688 (patch)
tree      | 426e77c6f6e4939c80440bf1fabcb020e3ee145b /virt
parent    | cc24da0742870f152ddf1002aa39dfcd83f7cf9c (diff)
parent    | 4ec62b2b2e6bd7ddef7b6cea6e5db7b5578a6532 (diff)
Forward to 2.6.33-rc8
Merge branch 'linus' into rt/head with a pile of conflicts.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'virt')
-rw-r--r-- | virt/kvm/Kconfig         | 14
-rw-r--r-- | virt/kvm/assigned-dev.c  | 818
-rw-r--r-- | virt/kvm/coalesced_mmio.c | 74
-rw-r--r-- | virt/kvm/coalesced_mmio.h | 1
-rw-r--r-- | virt/kvm/eventfd.c       | 588
-rw-r--r-- | virt/kvm/ioapic.c        | 144
-rw-r--r-- | virt/kvm/ioapic.h        | 5
-rw-r--r-- | virt/kvm/iodev.h         | 55
-rw-r--r-- | virt/kvm/irq_comm.c      | 262
-rw-r--r-- | virt/kvm/kvm_main.c      | 1231
-rw-r--r-- | virt/kvm/kvm_trace.c     | 285
11 files changed, 2119 insertions, 1358 deletions
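The largest new file in the diff below is virt/kvm/eventfd.c, which lets an eventfd act both as an interrupt source (irqfd: signaling the fd injects a GSI edge into the guest) and as a doorbell for guest MMIO/PIO writes (ioeventfd). Userspace drives the irqfd side through a VM ioctl; the following is a minimal sketch of the assign path, assuming the KVM_IRQFD ioctl and struct kvm_irqfd from the matching <linux/kvm.h> UAPI headers — the wire_irqfd() helper is illustrative, not part of the diff.

```c
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: route an eventfd to a guest GSI via irqfd (assign path). */
static int wire_irqfd(int vmfd, int gsi)
{
	struct kvm_irqfd req;
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.fd  = efd;	/* eventfd whose wait queue kvm_irqfd_assign() hooks */
	req.gsi = gsi;	/* guest interrupt line to pulse */

	/* After this succeeds, a write() to efd injects the interrupt
	 * without a trip through the VMM; passing the same fd/gsi with
	 * KVM_IRQFD_FLAG_DEASSIGN set undoes the binding. */
	return ioctl(vmfd, KVM_IRQFD, &req);
}
```

On the kernel side this path lands in kvm_irqfd_assign(): the eventfd's poll table is hooked with a custom wake-up function, POLLIN schedules irqfd_inject(), and POLLHUP schedules a race-free teardown on the dedicated kvm-irqfd-cleanup workqueue.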
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig new file mode 100644 index 000000000000..daece36c0a57 --- /dev/null +++ b/virt/kvm/Kconfig @@ -0,0 +1,14 @@ +# KVM common configuration items and defaults + +config HAVE_KVM + bool + +config HAVE_KVM_IRQCHIP + bool + +config HAVE_KVM_EVENTFD + bool + select EVENTFD + +config KVM_APIC_ARCHITECTURE + bool diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c new file mode 100644 index 000000000000..f73de631e3ee --- /dev/null +++ b/virt/kvm/assigned-dev.c @@ -0,0 +1,818 @@ +/* + * Kernel-based Virtual Machine - device assignment support + * + * Copyright (C) 2006-9 Red Hat, Inc + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/errno.h> +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include "irq.h" + +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static int find_index_from_host_irq(struct kvm_assigned_dev_kernel + *assigned_dev, int irq) +{ + int i, index; + struct msix_entry *host_msix_entries; + + host_msix_entries = assigned_dev->host_msix_entries; + + index = -1; + for (i = 0; i < assigned_dev->entries_nr; i++) + if (irq == host_msix_entries[i].vector) { + index = i; + break; + } + if (index < 0) { + printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); + return 0; + } + + return index; +} + +static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +{ + struct kvm_assigned_dev_kernel *assigned_dev; + struct kvm *kvm; + int i; + + assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, + interrupt_work); + kvm = assigned_dev->kvm; + + spin_lock_irq(&assigned_dev->assigned_dev_lock); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + struct kvm_guest_msix_entry *guest_entries = + assigned_dev->guest_msix_entries; + for (i = 0; i < assigned_dev->entries_nr; i++) { + if (!(guest_entries[i].flags & + KVM_ASSIGNED_MSIX_PENDING)) + continue; + guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, + guest_entries[i].vector, 1); + } + } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + assigned_dev->guest_irq, 1); + + spin_unlock_irq(&assigned_dev->assigned_dev_lock); +} + +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) +{ + unsigned long flags; + struct kvm_assigned_dev_kernel *assigned_dev = + (struct kvm_assigned_dev_kernel *) dev_id; + + spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int index = find_index_from_host_irq(assigned_dev, irq); + if (index < 0) + goto out; + assigned_dev->guest_msix_entries[index].flags |= + KVM_ASSIGNED_MSIX_PENDING; + } + + schedule_work(&assigned_dev->interrupt_work); + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + disable_irq_nosync(irq); + assigned_dev->host_irq_disabled = true; + } + +out: + spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); + return IRQ_HANDLED; +} + +/* Ack 
the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev; + unsigned long flags; + + if (kian->gsi == -1) + return; + + dev = container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); + + /* The guest irq may be shared so this ack may be + * from another device. + */ + spin_lock_irqsave(&dev->assigned_dev_lock, flags); + if (dev->host_irq_disabled) { + enable_irq(dev->host_irq); + dev->host_irq_disabled = false; + } + spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); +} + +static void deassign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); + assigned_dev->ack_notifier.gsi = -1; + + if (assigned_dev->irq_source_id != -1) + kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); + assigned_dev->irq_source_id = -1; + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); +} + +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ +static void deassign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + /* + * In kvm_free_device_irq, cancel_work_sync return true if: + * 1. work is scheduled, and then cancelled. + * 2. work callback is executed. + * + * The first one ensured that the irq is disabled and no more events + * would happen. But for the second one, the irq may be enabled (e.g. + * for MSI). So we disable irq here to prevent further events. + * + * Notice this maybe result in nested disable if the interrupt type is + * INTx, but it's OK for we are going to free it. + * + * If this function is a part of VM destroy, please ensure that till + * now, the kvm state is still legal for probably we also have to wait + * interrupt_work done. 
+ */ + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { + int i; + for (i = 0; i < assigned_dev->entries_nr; i++) + disable_irq_nosync(assigned_dev-> + host_msix_entries[i].vector); + + cancel_work_sync(&assigned_dev->interrupt_work); + + for (i = 0; i < assigned_dev->entries_nr; i++) + free_irq(assigned_dev->host_msix_entries[i].vector, + (void *)assigned_dev); + + assigned_dev->entries_nr = 0; + kfree(assigned_dev->host_msix_entries); + kfree(assigned_dev->guest_msix_entries); + pci_disable_msix(assigned_dev->dev); + } else { + /* Deal with MSI and INTx */ + disable_irq_nosync(assigned_dev->host_irq); + cancel_work_sync(&assigned_dev->interrupt_work); + + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) + pci_disable_msi(assigned_dev->dev); + } + + assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); +} + +static int kvm_deassign_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev, + unsigned long irq_requested_type) +{ + unsigned long guest_irq_type, host_irq_type; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + /* no irq assignment to deassign */ + if (!assigned_dev->irq_requested_type) + return -ENXIO; + + host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; + guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; + + if (host_irq_type) + deassign_host_irq(kvm, assigned_dev); + if (guest_irq_type) + deassign_guest_irq(kvm, assigned_dev); + + return 0; +} + +static void kvm_free_assigned_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); +} + +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + kvm_free_assigned_irq(kvm, assigned_dev); + + pci_reset_function(assigned_dev->dev); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + +static int assigned_device_enable_host_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + dev->host_irq = dev->dev->irq; + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. 
+ */ + if (request_irq(dev->host_irq, kvm_assigned_dev_intr, + 0, "kvm_assigned_intx_device", (void *)dev)) + return -EIO; + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_host_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int r; + + if (!dev->dev->msi_enabled) { + r = pci_enable_msi(dev->dev); + if (r) + return r; + } + + dev->host_irq = dev->dev->irq; + if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_msi_device", (void *)dev)) { + pci_disable_msi(dev->dev); + return -EIO; + } + + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_host_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev) +{ + int i, r = -EINVAL; + + /* host_msix_entries and guest_msix_entries should have been + * initialized */ + if (dev->entries_nr == 0) + return r; + + r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); + if (r) + return r; + + for (i = 0; i < dev->entries_nr; i++) { + r = request_irq(dev->host_msix_entries[i].vector, + kvm_assigned_dev_intr, 0, + "kvm_assigned_msix_device", + (void *)dev); + /* FIXME: free requested_irq's on failure */ + if (r) + return r; + } + + return 0; +} + +#endif + +static int assigned_device_enable_guest_intx(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = irq->guest_irq; + return 0; +} + +#ifdef __KVM_HAVE_MSI +static int assigned_device_enable_guest_msi(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; + return 0; +} +#endif + +#ifdef __KVM_HAVE_MSIX +static int assigned_device_enable_guest_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq) +{ + dev->guest_irq = irq->guest_irq; + dev->ack_notifier.gsi = -1; + dev->host_irq_disabled = false; + return 0; +} +#endif + +static int assign_host_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + __u32 host_irq_type) +{ + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) + return r; + + switch (host_irq_type) { + case KVM_DEV_IRQ_HOST_INTX: + r = assigned_device_enable_host_intx(kvm, dev); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_HOST_MSI: + r = assigned_device_enable_host_msi(kvm, dev); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_HOST_MSIX: + r = assigned_device_enable_host_msix(kvm, dev); + break; +#endif + default: + r = -EINVAL; + } + + if (!r) + dev->irq_requested_type |= host_irq_type; + + return r; +} + +static int assign_guest_irq(struct kvm *kvm, + struct kvm_assigned_dev_kernel *dev, + struct kvm_assigned_irq *irq, + unsigned long guest_irq_type) +{ + int id; + int r = -EEXIST; + + if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) + return r; + + id = kvm_request_irq_source_id(kvm); + if (id < 0) + return id; + + dev->irq_source_id = id; + + switch (guest_irq_type) { + case KVM_DEV_IRQ_GUEST_INTX: + r = assigned_device_enable_guest_intx(kvm, dev, irq); + break; +#ifdef __KVM_HAVE_MSI + case KVM_DEV_IRQ_GUEST_MSI: + r = assigned_device_enable_guest_msi(kvm, dev, irq); + break; +#endif +#ifdef __KVM_HAVE_MSIX + case KVM_DEV_IRQ_GUEST_MSIX: + r = assigned_device_enable_guest_msix(kvm, dev, irq); + break; +#endif + default: + r = -EINVAL; + } + + if (!r) { + dev->irq_requested_type |= guest_irq_type; + kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); 
+ } else + kvm_free_irq_source_id(kvm, dev->irq_source_id); + + return r; +} + +/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq *assigned_irq) +{ + int r = -EINVAL; + struct kvm_assigned_dev_kernel *match; + unsigned long host_irq_type, guest_irq_type; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(&kvm->lock); + r = -ENODEV; + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); + guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); + + r = -EINVAL; + /* can only assign one type at a time */ + if (hweight_long(host_irq_type) > 1) + goto out; + if (hweight_long(guest_irq_type) > 1) + goto out; + if (host_irq_type == 0 && guest_irq_type == 0) + goto out; + + r = 0; + if (host_irq_type) + r = assign_host_irq(kvm, match, host_irq_type); + if (r) + goto out; + + if (guest_irq_type) + r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = -ENODEV; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) + goto out; + + r = kvm_deassign_irq(kvm, match, assigned_irq->flags); +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + mutex_lock(&kvm->lock); + down_read(&kvm->slots_lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EEXIST; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_bus_and_slot(assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + + pci_reset_function(dev); + + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->flags = assigned_dev->flags; + match->dev = dev; + spin_lock_init(&match->assigned_dev_lock); + match->irq_source_id = -1; + match->kvm = kvm; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); + + list_add(&match->list, &kvm->arch.assigned_dev_head); + + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); + if (r) + goto out_list_del; + } + r = kvm_assign_device(kvm, match); + if (r) + goto out_list_del; + } + +out: + up_read(&kvm->slots_lock); + mutex_unlock(&kvm->lock); + return r; 
+out_list_del: + list_del(&match->list); + pci_release_regions(dev); +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + up_read(&kvm->slots_lock); + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + printk(KERN_INFO "%s: device hasn't been assigned before, " + "so cannot be deassigned\n", __func__); + r = -EINVAL; + goto out; + } + + if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) + kvm_deassign_device(kvm, match); + + kvm_free_assigned_device(kvm, match); + +out: + mutex_unlock(&kvm->lock); + return r; +} + + +#ifdef __KVM_HAVE_MSIX +static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, + struct kvm_assigned_msix_nr *entry_nr) +{ + int r = 0; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry_nr->assigned_dev_id); + if (!adev) { + r = -EINVAL; + goto msix_nr_out; + } + + if (adev->entries_nr == 0) { + adev->entries_nr = entry_nr->entry_nr; + if (adev->entries_nr == 0 || + adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto msix_nr_out; + } + + adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * + entry_nr->entry_nr, + GFP_KERNEL); + if (!adev->host_msix_entries) { + r = -ENOMEM; + goto msix_nr_out; + } + adev->guest_msix_entries = kzalloc( + sizeof(struct kvm_guest_msix_entry) * + entry_nr->entry_nr, GFP_KERNEL); + if (!adev->guest_msix_entries) { + kfree(adev->host_msix_entries); + r = -ENOMEM; + goto msix_nr_out; + } + } else /* Not allowed set MSI-X number twice */ + r = -EINVAL; +msix_nr_out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, + struct kvm_assigned_msix_entry *entry) +{ + int r = 0, i; + struct kvm_assigned_dev_kernel *adev; + + mutex_lock(&kvm->lock); + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + entry->assigned_dev_id); + + if (!adev) { + r = -EINVAL; + goto msix_entry_out; + } + + for (i = 0; i < adev->entries_nr; i++) + if (adev->guest_msix_entries[i].vector == 0 || + adev->guest_msix_entries[i].entry == entry->entry) { + adev->guest_msix_entries[i].entry = entry->entry; + adev->guest_msix_entries[i].vector = entry->gsi; + adev->host_msix_entries[i].entry = entry->entry; + break; + } + if (i == adev->entries_nr) { + r = -ENOSPC; + goto msix_entry_out; + } + +msix_entry_out: + mutex_unlock(&kvm->lock); + + return r; +} +#endif + +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + int r = -ENOTTY; + + switch (ioctl) { + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + r = -EOPNOTSUPP; + break; + } +#ifdef KVM_CAP_ASSIGN_DEV_IRQ + case KVM_ASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } + case KVM_DEASSIGN_DEV_IRQ: { + struct kvm_assigned_irq assigned_irq; 
+ + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } +#endif +#ifdef KVM_CAP_DEVICE_DEASSIGNMENT + case KVM_DEASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } +#endif +#ifdef KVM_CAP_IRQ_ROUTING + case KVM_SET_GSI_ROUTING: { + struct kvm_irq_routing routing; + struct kvm_irq_routing __user *urouting; + struct kvm_irq_routing_entry *entries; + + r = -EFAULT; + if (copy_from_user(&routing, argp, sizeof(routing))) + goto out; + r = -EINVAL; + if (routing.nr >= KVM_MAX_IRQ_ROUTES) + goto out; + if (routing.flags) + goto out; + r = -ENOMEM; + entries = vmalloc(routing.nr * sizeof(*entries)); + if (!entries) + goto out; + r = -EFAULT; + urouting = argp; + if (copy_from_user(entries, urouting->entries, + routing.nr * sizeof(*entries))) + goto out_free_irq_routing; + r = kvm_set_irq_routing(kvm, entries, routing.nr, + routing.flags); + out_free_irq_routing: + vfree(entries); + break; + } +#endif /* KVM_CAP_IRQ_ROUTING */ +#ifdef __KVM_HAVE_MSIX + case KVM_ASSIGN_SET_MSIX_NR: { + struct kvm_assigned_msix_nr entry_nr; + r = -EFAULT; + if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) + goto out; + r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); + if (r) + goto out; + break; + } + case KVM_ASSIGN_SET_MSIX_ENTRY: { + struct kvm_assigned_msix_entry entry; + r = -EFAULT; + if (copy_from_user(&entry, argp, sizeof entry)) + goto out; + r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); + if (r) + goto out; + break; + } +#endif + } +out: + return r; +} + diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 5ae620d32fac..04d69cd7049b 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -14,32 +14,28 @@ #include "coalesced_mmio.h" -static int coalesced_mmio_in_range(struct kvm_io_device *this, - gpa_t addr, int len, int is_write) +static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_coalesced_mmio_dev, dev); +} + +static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, + gpa_t addr, int len) { - struct kvm_coalesced_mmio_dev *dev = - (struct kvm_coalesced_mmio_dev*)this->private; struct kvm_coalesced_mmio_zone *zone; - int next; + struct kvm_coalesced_mmio_ring *ring; + unsigned avail; int i; - if (!is_write) - return 0; - - /* kvm->lock is taken by the caller and must be not released before - * dev.read/write - */ - /* Are we able to batch it ? 
*/ /* last is the first free entry * check if we don't meet the first used entry * there is always one unused entry in the buffer */ - - next = (dev->kvm->coalesced_mmio_ring->last + 1) % - KVM_COALESCED_MMIO_MAX; - if (next == dev->kvm->coalesced_mmio_ring->first) { + ring = dev->kvm->coalesced_mmio_ring; + avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; + if (avail < KVM_MAX_VCPUS) { /* full */ return 0; } @@ -60,14 +56,15 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this, return 0; } -static void coalesced_mmio_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *val) +static int coalesced_mmio_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) { - struct kvm_coalesced_mmio_dev *dev = - (struct kvm_coalesced_mmio_dev*)this->private; + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; + if (!coalesced_mmio_in_range(dev, addr, len)) + return -EOPNOTSUPP; - /* kvm->lock must be taken by caller before call to in_range()*/ + spin_lock(&dev->lock); /* copy data in first free entry of the ring */ @@ -76,29 +73,40 @@ static void coalesced_mmio_write(struct kvm_io_device *this, memcpy(ring->coalesced_mmio[ring->last].data, val, len); smp_wmb(); ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; + spin_unlock(&dev->lock); + return 0; } static void coalesced_mmio_destructor(struct kvm_io_device *this) { - kfree(this); + struct kvm_coalesced_mmio_dev *dev = to_mmio(this); + + kfree(dev); } +static const struct kvm_io_device_ops coalesced_mmio_ops = { + .write = coalesced_mmio_write, + .destructor = coalesced_mmio_destructor, +}; + int kvm_coalesced_mmio_init(struct kvm *kvm) { struct kvm_coalesced_mmio_dev *dev; + int ret; dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) return -ENOMEM; - dev->dev.write = coalesced_mmio_write; - dev->dev.in_range = coalesced_mmio_in_range; - dev->dev.destructor = coalesced_mmio_destructor; - dev->dev.private = dev; + spin_lock_init(&dev->lock); + kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; kvm->coalesced_mmio_dev = dev; - kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev); - return 0; + ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev); + if (ret < 0) + kfree(dev); + + return ret; } int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, @@ -109,16 +117,16 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, if (dev == NULL) return -EINVAL; - mutex_lock(&kvm->lock); + down_write(&kvm->slots_lock); if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return -ENOBUFS; } dev->zone[dev->nb_zones] = *zone; dev->nb_zones++; - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return 0; } @@ -132,7 +140,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, if (dev == NULL) return -EINVAL; - mutex_lock(&kvm->lock); + down_write(&kvm->slots_lock); i = dev->nb_zones; while(i) { @@ -150,7 +158,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, i--; } - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); return 0; } diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h index 5ac0ec628461..4b49f27fa31e 100644 --- a/virt/kvm/coalesced_mmio.h +++ b/virt/kvm/coalesced_mmio.h @@ -12,6 +12,7 @@ struct kvm_coalesced_mmio_dev { struct kvm_io_device dev; struct kvm *kvm; + spinlock_t lock; int nb_zones; struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; 
}; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c new file mode 100644 index 000000000000..a9d3fc6c681c --- /dev/null +++ b/virt/kvm/eventfd.c @@ -0,0 +1,588 @@ +/* + * kvm eventfd support - use eventfd objects to signal various KVM events + * + * Copyright 2009 Novell. All Rights Reserved. + * + * Author: + * Gregory Haskins <ghaskins@novell.com> + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/workqueue.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/list.h> +#include <linux/eventfd.h> +#include <linux/kernel.h> + +#include "iodev.h" + +/* + * -------------------------------------------------------------------- + * irqfd: Allows an fd to be used to inject an interrupt to the guest + * + * Credit goes to Avi Kivity for the original idea. + * -------------------------------------------------------------------- + */ + +struct _irqfd { + struct kvm *kvm; + struct eventfd_ctx *eventfd; + int gsi; + struct list_head list; + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct inject; + struct work_struct shutdown; +}; + +static struct workqueue_struct *irqfd_cleanup_wq; + +static void +irqfd_inject(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); + struct kvm *kvm = irqfd->kvm; + + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void +irqfd_shutdown(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown); + u64 cnt; + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. + */ + eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); + + /* + * We know no new events will be scheduled at this point, so block + * until all previously outstanding events have completed + */ + flush_work(&irqfd->inject); + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->eventfd); + kfree(irqfd); +} + + +/* assumes kvm->irqfds.lock is held */ +static bool +irqfd_is_active(struct _irqfd *irqfd) +{ + return list_empty(&irqfd->list) ? 
false : true; +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes kvm->irqfds.lock is held + */ +static void +irqfd_deactivate(struct _irqfd *irqfd) +{ + BUG_ON(!irqfd_is_active(irqfd)); + + list_del_init(&irqfd->list); + + queue_work(irqfd_cleanup_wq, &irqfd->shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int +irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait); + unsigned long flags = (unsigned long)key; + + if (flags & POLLIN) + /* An event has been signaled, inject an interrupt */ + schedule_work(&irqfd->inject); + + if (flags & POLLHUP) { + /* The eventfd is closing, detach from KVM */ + struct kvm *kvm = irqfd->kvm; + unsigned long flags; + + spin_lock_irqsave(&kvm->irqfds.lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the irqfds.lock since the item is + * deactivated from the KVM side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. + * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (irqfd_is_active(irqfd)) + irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&kvm->irqfds.lock, flags); + } + + return 0; +} + +static void +irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt); + + irqfd->wqh = wqh; + add_wait_queue(wqh, &irqfd->wait); +} + +static int +kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) +{ + struct _irqfd *irqfd, *tmp; + struct file *file = NULL; + struct eventfd_ctx *eventfd = NULL; + int ret; + unsigned int events; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->kvm = kvm; + irqfd->gsi = gsi; + INIT_LIST_HEAD(&irqfd->list); + INIT_WORK(&irqfd->inject, irqfd_inject); + INIT_WORK(&irqfd->shutdown, irqfd_shutdown); + + file = eventfd_fget(fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + eventfd = eventfd_ctx_fileget(file); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->eventfd = eventfd; + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); + init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); + + spin_lock_irq(&kvm->irqfds.lock); + + ret = 0; + list_for_each_entry(tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd != tmp->eventfd) + continue; + /* This fd is used for another irq already. */ + ret = -EBUSY; + spin_unlock_irq(&kvm->irqfds.lock); + goto fail; + } + + events = file->f_op->poll(file, &irqfd->pt); + + list_add_tail(&irqfd->list, &kvm->irqfds.items); + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. 
+ */ + if (events & POLLIN) + schedule_work(&irqfd->inject); + + /* + * do not drop the file until the irqfd is fully initialized, otherwise + * we might race against the POLLHUP + */ + fput(file); + + return 0; + +fail: + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + if (!IS_ERR(file)) + fput(file); + + kfree(irqfd); + return ret; +} + +void +kvm_eventfd_init(struct kvm *kvm) +{ + spin_lock_init(&kvm->irqfds.lock); + INIT_LIST_HEAD(&kvm->irqfds.items); + INIT_LIST_HEAD(&kvm->ioeventfds); +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int +kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) +{ + struct _irqfd *irqfd, *tmp; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { + if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) + irqfd_deactivate(irqfd); + } + + spin_unlock_irq(&kvm->irqfds.lock); + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int +kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +{ + if (flags & KVM_IRQFD_FLAG_DEASSIGN) + return kvm_irqfd_deassign(kvm, fd, gsi); + + return kvm_irqfd_assign(kvm, fd, gsi); +} + +/* + * This function is called as the kvm VM fd is being released. Shutdown all + * irqfds that still remain open + */ +void +kvm_irqfd_release(struct kvm *kvm) +{ + struct _irqfd *irqfd, *tmp; + + spin_lock_irq(&kvm->irqfds.lock); + + list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) + irqfd_deactivate(irqfd); + + spin_unlock_irq(&kvm->irqfds.lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a kvm* reference. + */ + flush_workqueue(irqfd_cleanup_wq); + +} + +/* + * create a host-wide workqueue for issuing deferred shutdown requests + * aggregated from all vm* instances. We need our own isolated single-thread + * queue to prevent deadlock against flushing the normal work-queue. + */ +static int __init irqfd_module_init(void) +{ + irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void __exit irqfd_module_exit(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +module_init(irqfd_module_init); +module_exit(irqfd_module_exit); + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. + * + * userspace can register a PIO/MMIO address with an eventfd for receiving + * notification when the memory has been touched. 
+ * -------------------------------------------------------------------- + */ + +struct _ioeventfd { + struct list_head list; + u64 addr; + int length; + struct eventfd_ctx *eventfd; + u64 datamatch; + struct kvm_io_device dev; + bool wildcard; +}; + +static inline struct _ioeventfd * +to_ioeventfd(struct kvm_io_device *dev) +{ + return container_of(dev, struct _ioeventfd, dev); +} + +static void +ioeventfd_release(struct _ioeventfd *p) +{ + eventfd_ctx_put(p->eventfd); + list_del(&p->list); + kfree(p); +} + +static bool +ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) +{ + u64 _val; + + if (!(addr == p->addr && len == p->length)) + /* address-range must be precise for a hit */ + return false; + + if (p->wildcard) + /* all else equal, wildcard is always a hit */ + return true; + + /* otherwise, we have to actually compare the data */ + + BUG_ON(!IS_ALIGNED((unsigned long)val, len)); + + switch (len) { + case 1: + _val = *(u8 *)val; + break; + case 2: + _val = *(u16 *)val; + break; + case 4: + _val = *(u32 *)val; + break; + case 8: + _val = *(u64 *)val; + break; + default: + return false; + } + + return _val == p->datamatch ? true : false; +} + +/* MMIO/PIO writes trigger an event if the addr/val match */ +static int +ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + if (!ioeventfd_in_range(p, addr, len, val)) + return -EOPNOTSUPP; + + eventfd_signal(p->eventfd, 1); + return 0; +} + +/* + * This function is called as KVM is completely shutting down. We do not + * need to worry about locking just nuke anything we have as quickly as possible + */ +static void +ioeventfd_destructor(struct kvm_io_device *this) +{ + struct _ioeventfd *p = to_ioeventfd(this); + + ioeventfd_release(p); +} + +static const struct kvm_io_device_ops ioeventfd_ops = { + .write = ioeventfd_write, + .destructor = ioeventfd_destructor, +}; + +/* assumes kvm->slots_lock held */ +static bool +ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) +{ + struct _ioeventfd *_p; + + list_for_each_entry(_p, &kvm->ioeventfds, list) + if (_p->addr == p->addr && _p->length == p->length && + (_p->wildcard || p->wildcard || + _p->datamatch == p->datamatch)) + return true; + + return false; +} + +static int +kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + struct kvm_io_bus *bus = pio ? 
&kvm->pio_bus : &kvm->mmio_bus; + struct _ioeventfd *p; + struct eventfd_ctx *eventfd; + int ret; + + /* must be natural-word sized */ + switch (args->len) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&p->list); + p->addr = args->addr; + p->length = args->len; + p->eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) + p->datamatch = args->datamatch; + else + p->wildcard = true; + + down_write(&kvm->slots_lock); + + /* Verify that there isnt a match already */ + if (ioeventfd_check_collision(kvm, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + kvm_iodevice_init(&p->dev, &ioeventfd_ops); + + ret = __kvm_io_bus_register_dev(bus, &p->dev); + if (ret < 0) + goto unlock_fail; + + list_add_tail(&p->list, &kvm->ioeventfds); + + up_write(&kvm->slots_lock); + + return 0; + +unlock_fail: + up_write(&kvm->slots_lock); + +fail: + kfree(p); + eventfd_ctx_put(eventfd); + + return ret; +} + +static int +kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; + struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus; + struct _ioeventfd *p, *tmp; + struct eventfd_ctx *eventfd; + int ret = -ENOENT; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + down_write(&kvm->slots_lock); + + list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { + bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); + + if (p->eventfd != eventfd || + p->addr != args->addr || + p->length != args->len || + p->wildcard != wildcard) + continue; + + if (!p->wildcard && p->datamatch != args->datamatch) + continue; + + __kvm_io_bus_unregister_dev(bus, &p->dev); + ioeventfd_release(p); + ret = 0; + break; + } + + up_write(&kvm->slots_lock); + + eventfd_ctx_put(eventfd); + + return ret; +} + +int +kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) +{ + if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) + return kvm_deassign_ioeventfd(kvm, args); + + return kvm_assign_ioeventfd(kvm, args); +} diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1150c6d5c7b8..38a2d20b89de 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -36,6 +36,7 @@ #include <asm/processor.h> #include <asm/page.h> #include <asm/current.h> +#include <trace/events/kvm.h> #include "ioapic.h" #include "lapic.h" @@ -103,6 +104,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; + union kvm_ioapic_redirect_entry *e; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -122,19 +124,20 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; - mask_before = ioapic->redirtbl[index].fields.mask; + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; if (ioapic->ioregsel & 1) { - ioapic->redirtbl[index].bits &= 0xffffffff; - ioapic->redirtbl[index].bits |= (u64) val << 32; + e->bits &= 0xffffffff; + 
e->bits |= (u64) val << 32; } else { - ioapic->redirtbl[index].bits &= ~0xffffffffULL; - ioapic->redirtbl[index].bits |= (u32) val; - ioapic->redirtbl[index].fields.remote_irr = 0; + e->bits &= ~0xffffffffULL; + e->bits |= (u32) val; + e->fields.remote_irr = 0; } - mask_after = ioapic->redirtbl[index].fields.mask; + mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); - if (ioapic->redirtbl[index].fields.trig_mode == IOAPIC_LEVEL_TRIG + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index); break; @@ -164,7 +167,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) /* Always delivery PIT interrupt to vcpu 0 */ if (irq == 0) { irqe.dest_mode = 0; /* Physical mode. */ - irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; + /* need to read apic_id from apic regiest since + * it can be rewritten */ + irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id; } #endif return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); @@ -177,6 +182,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) union kvm_ioapic_redirect_entry entry; int ret = 1; + mutex_lock(&ioapic->lock); if (irq >= 0 && irq < IOAPIC_NUM_PINS) { entry = ioapic->redirtbl[irq]; level ^= entry.fields.polarity; @@ -188,57 +194,82 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) if ((edge && old_irr != ioapic->irr) || (!edge && !entry.fields.remote_irr)) ret = ioapic_service(ioapic, irq); + else + ret = 0; /* report coalesced interrupt */ } + trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); } + mutex_unlock(&ioapic->lock); + return ret; } -static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin, - int trigger_mode) +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, + int trigger_mode) { - union kvm_ioapic_redirect_entry *ent; + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.vector != vector) + continue; - ent = &ioapic->redirtbl[pin]; + /* + * We are dropping lock while calling ack notifiers because ack + * notifier callbacks for assigned devices call into IOAPIC + * recursively. Since remote_irr is cleared only after call + * to notifiers if the same vector will be delivered while lock + * is dropped it will be put into irr and will be delivered + * after ack notifier returns. 
+ */ + mutex_unlock(&ioapic->lock); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); + mutex_lock(&ioapic->lock); - kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin); + if (trigger_mode != IOAPIC_LEVEL_TRIG) + continue; - if (trigger_mode == IOAPIC_LEVEL_TRIG) { ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << pin))) - ioapic_service(ioapic, pin); + if (!ent->fields.mask && (ioapic->irr & (1 << i))) + ioapic_service(ioapic, i); } } void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - int i; - for (i = 0; i < IOAPIC_NUM_PINS; i++) - if (ioapic->redirtbl[i].fields.vector == vector) - __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); + mutex_lock(&ioapic->lock); + __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); + mutex_unlock(&ioapic->lock); } -static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) +static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + return container_of(dev, struct kvm_ioapic, dev); +} +static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr) +{ return ((addr >= ioapic->base_address && (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); } -static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, - void *val) +static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + struct kvm_ioapic *ioapic = to_ioapic(this); u32 result; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; ioapic_debug("addr %lx\n", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ addr &= 0xff; + mutex_lock(&ioapic->lock); switch (addr) { case IOAPIC_REG_SELECT: result = ioapic->ioregsel; @@ -252,6 +283,8 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, result = 0; break; } + mutex_unlock(&ioapic->lock); + switch (len) { case 8: *(u64 *) val = result; @@ -264,25 +297,30 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, default: printk(KERN_WARNING "ioapic: wrong length %d\n", len); } + return 0; } -static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, - const void *val) +static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) { - struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + struct kvm_ioapic *ioapic = to_ioapic(this); u32 data; + if (!ioapic_in_range(ioapic, addr)) + return -EOPNOTSUPP; ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", (void*)addr, len, val); ASSERT(!(addr & 0xf)); /* check alignment */ + if (len == 4 || len == 8) data = *(u32 *) val; else { printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); - return; + return 0; } addr &= 0xff; + mutex_lock(&ioapic->lock); switch (addr) { case IOAPIC_REG_SELECT: ioapic->ioregsel = data; @@ -293,13 +331,15 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, break; #ifdef CONFIG_IA64 case IOAPIC_REG_EOI: - kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG); + __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG); break; #endif default: break; } + mutex_unlock(&ioapic->lock); + return 0; } void kvm_ioapic_reset(struct kvm_ioapic *ioapic) @@ -314,21 +354,51 @@ void kvm_ioapic_reset(struct 
kvm_ioapic *ioapic) ioapic->id = 0; } +static const struct kvm_io_device_ops ioapic_mmio_ops = { + .read = ioapic_mmio_read, + .write = ioapic_mmio_write, +}; + int kvm_ioapic_init(struct kvm *kvm) { struct kvm_ioapic *ioapic; + int ret; ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); if (!ioapic) return -ENOMEM; + mutex_init(&ioapic->lock); kvm->arch.vioapic = ioapic; kvm_ioapic_reset(ioapic); - ioapic->dev.read = ioapic_mmio_read; - ioapic->dev.write = ioapic_mmio_write; - ioapic->dev.in_range = ioapic_in_range; - ioapic->dev.private = ioapic; + kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops); ioapic->kvm = kvm; - kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); + ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev); + if (ret < 0) + kfree(ioapic); + + return ret; +} + +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + mutex_lock(&ioapic->lock); + memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); + mutex_unlock(&ioapic->lock); return 0; } +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +{ + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + if (!ioapic) + return -EINVAL; + + mutex_lock(&ioapic->lock); + memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); + mutex_unlock(&ioapic->lock); + return 0; +} diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 7080b713c160..419c43b667ab 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -41,9 +41,11 @@ struct kvm_ioapic { u32 irr; u32 pad; union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; + unsigned long irq_states[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); + struct mutex lock; }; #ifdef DEBUG @@ -73,4 +75,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq); +int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); + #endif diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h index 55e8846ac3a6..12fd3caffd2b 100644 --- a/virt/kvm/iodev.h +++ b/virt/kvm/iodev.h @@ -17,49 +17,54 @@ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> +#include <asm/errno.h> -struct kvm_io_device { - void (*read)(struct kvm_io_device *this, +struct kvm_io_device; + +/** + * kvm_io_device_ops are called under kvm slots_lock. + * read and write handlers return 0 if the transaction has been handled, + * or non-zero to have it passed to the next device. 
+ **/ +struct kvm_io_device_ops { + int (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + int (*write)(struct kvm_io_device *this, gpa_t addr, int len, - void *val); - void (*write)(struct kvm_io_device *this, - gpa_t addr, - int len, - const void *val); - int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len, - int is_write); + const void *val); void (*destructor)(struct kvm_io_device *this); +}; - void *private; + +struct kvm_io_device { + const struct kvm_io_device_ops *ops; }; -static inline void kvm_iodevice_read(struct kvm_io_device *dev, - gpa_t addr, - int len, - void *val) +static inline void kvm_iodevice_init(struct kvm_io_device *dev, + const struct kvm_io_device_ops *ops) { - dev->read(dev, addr, len, val); + dev->ops = ops; } -static inline void kvm_iodevice_write(struct kvm_io_device *dev, - gpa_t addr, - int len, - const void *val) +static inline int kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, int l, void *v) { - dev->write(dev, addr, len, val); + return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP; } -static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, - gpa_t addr, int len, int is_write) +static inline int kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, int l, const void *v) { - return dev->in_range(dev, addr, len, is_write); + return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP; } static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) { - if (dev->destructor) - dev->destructor(dev); + if (dev->ops->destructor) + dev->ops->destructor(dev); } #endif /* __KVM_IODEV_H__ */ diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index ddc17f0e2f35..9fd5b3ebc517 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -20,6 +20,7 @@ */ #include <linux/kvm_host.h> +#include <trace/events/kvm.h> #include <asm/msidef.h> #ifdef CONFIG_IA64 @@ -30,20 +31,39 @@ #include "ioapic.h" +static inline int kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + set_bit(irq_source_id, irq_state); + else + clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) + struct kvm *kvm, int irq_source_id, int level) { #ifdef CONFIG_X86 - return kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level); + struct kvm_pic *pic = pic_irqchip(kvm); + level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin], + irq_source_id, level); + return kvm_pic_set_irq(pic, e->irqchip.pin, level); #else return -1; #endif } static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) + struct kvm *kvm, int irq_source_id, int level) { - return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level); + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin], + irq_source_id, level); + + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level); } inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) @@ -66,10 +86,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, kvm_is_dm_lowest_prio(irq)) printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); - for (i = 0; i < KVM_MAX_VCPUS; i++) { - vcpu = kvm->vcpus[i]; - - if (!vcpu || !kvm_apic_present(vcpu)) + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) continue; if 
(!kvm_apic_match_dest(vcpu, src, irq->shorthand, @@ -95,10 +113,15 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, } static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int level) + struct kvm *kvm, int irq_source_id, int level) { struct kvm_lapic_irq irq; + if (!level) + return -1; + + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + irq.dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; irq.vector = (e->msi.data & @@ -113,90 +136,97 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return kvm_irq_delivery_to_apic(kvm, NULL, &irq); } -/* This should be called with the kvm->lock mutex held +/* * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) * > 0 Number of CPUs interrupt was delivered to */ -int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) { - struct kvm_kernel_irq_routing_entry *e; - unsigned long *irq_state, sig_level; - int ret = -1; - - if (irq < KVM_IOAPIC_NUM_PINS) { - irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; - - /* Logical OR for level trig interrupt */ - if (level) - set_bit(irq_source_id, irq_state); - else - clear_bit(irq_source_id, irq_state); - sig_level = !!(*irq_state); - } else /* Deal with MSI/MSI-X */ - sig_level = 1; + struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; + int ret = -1, i = 0; + struct kvm_irq_routing_table *irq_rt; + struct hlist_node *n; + + trace_kvm_set_irq(irq, level, irq_source_id); /* Not possible to detect if the guest uses the PIC or the * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ - list_for_each_entry(e, &kvm->irq_routing, link) - if (e->gsi == irq) { - int r = e->set(e, kvm, sig_level); - if (r < 0) - continue; + rcu_read_lock(); + irq_rt = rcu_dereference(kvm->irq_routing); + if (irq < irq_rt->nr_rt_entries) + hlist_for_each_entry(e, n, &irq_rt->map[irq], link) + irq_set[i++] = *e; + rcu_read_unlock(); + + while(i--) { + int r; + r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level); + if (r < 0) + continue; + + ret = r + ((ret < 0) ? 0 : ret); + } - ret = r + ((ret < 0) ? 
0 : ret); - } return ret; } void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) { - struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_ack_notifier *kian; struct hlist_node *n; - unsigned gsi = pin; - - list_for_each_entry(e, &kvm->irq_routing, link) - if (e->type == KVM_IRQ_ROUTING_IRQCHIP && - e->irqchip.irqchip == irqchip && - e->irqchip.pin == pin) { - gsi = e->gsi; - break; - } - - hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) - if (kian->gsi == gsi) - kian->irq_acked(kian); + int gsi; + + trace_kvm_ack_irq(irqchip, pin); + + rcu_read_lock(); + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) + kian->irq_acked(kian); + rcu_read_unlock(); } void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian) { - hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); + mutex_lock(&kvm->irq_lock); + hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); + mutex_unlock(&kvm->irq_lock); } -void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian) +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) { - hlist_del_init(&kian->link); + mutex_lock(&kvm->irq_lock); + hlist_del_init_rcu(&kian->link); + mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); } -/* The caller must hold kvm->lock mutex */ int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; - int irq_source_id = find_first_zero_bit(bitmap, - sizeof(kvm->arch.irq_sources_bitmap)); + int irq_source_id; + + mutex_lock(&kvm->irq_lock); + irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); - if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { + if (irq_source_id >= BITS_PER_LONG) { printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); - return -EFAULT; + irq_source_id = -EFAULT; + goto unlock; } ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); set_bit(irq_source_id, bitmap); +unlock: + mutex_unlock(&kvm->irq_lock); return irq_source_id; } @@ -207,27 +237,44 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); + mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || - irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { + irq_source_id >= BITS_PER_LONG) { printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); - return; + goto unlock; } - for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) - clear_bit(irq_source_id, &kvm->arch.irq_states[i]); clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); + if (!irqchip_in_kernel(kvm)) + goto unlock; + + for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) { + clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]); + if (i >= 16) + continue; +#ifdef CONFIG_X86 + clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]); +#endif + } +unlock: + mutex_unlock(&kvm->irq_lock); } void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { + mutex_lock(&kvm->irq_lock); kimn->irq = irq; - hlist_add_head(&kimn->link, &kvm->mask_notifier_list); + hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list); + mutex_unlock(&kvm->irq_lock); } void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { - hlist_del(&kimn->link); + mutex_lock(&kvm->irq_lock); + hlist_del_rcu(&kimn->link); + mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); } 
void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) @@ -235,29 +282,38 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) struct kvm_irq_mask_notifier *kimn; struct hlist_node *n; - hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) + rcu_read_lock(); + hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) if (kimn->irq == irq) kimn->func(kimn, mask); -} - -static void __kvm_free_irq_routing(struct list_head *irq_routing) -{ - struct kvm_kernel_irq_routing_entry *e, *n; - - list_for_each_entry_safe(e, n, irq_routing, link) - kfree(e); + rcu_read_unlock(); } void kvm_free_irq_routing(struct kvm *kvm) { - __kvm_free_irq_routing(&kvm->irq_routing); + /* Called only during vm destruction. Nobody can use the pointer + at this stage */ + kfree(kvm->irq_routing); } -static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, +static int setup_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; int delta; + unsigned max_pin; + struct kvm_kernel_irq_routing_entry *ei; + struct hlist_node *n; + + /* + * Do not allow GSI to be mapped to the same irqchip more than once. + * Allow only one to one mapping between GSI and MSI. + */ + hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link) + if (ei->type == KVM_IRQ_ROUTING_MSI || + ue->u.irqchip.irqchip == ei->irqchip.irqchip) + return r; e->gsi = ue->gsi; e->type = ue->type; @@ -267,12 +323,15 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_MASTER: e->set = kvm_set_pic_irq; + max_pin = 16; break; case KVM_IRQCHIP_PIC_SLAVE: e->set = kvm_set_pic_irq; + max_pin = 16; delta = 8; break; case KVM_IRQCHIP_IOAPIC: + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; default: @@ -280,6 +339,9 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, } e->irqchip.irqchip = ue->u.irqchip.irqchip; e->irqchip.pin = ue->u.irqchip.pin + delta; + if (e->irqchip.pin >= max_pin) + goto out; + rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; break; case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; @@ -290,6 +352,8 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, default: goto out; } + + hlist_add_head(&e->link, &rt->map[e->gsi]); r = 0; out: return r; @@ -301,43 +365,53 @@ int kvm_set_irq_routing(struct kvm *kvm, unsigned nr, unsigned flags) { - struct list_head irq_list = LIST_HEAD_INIT(irq_list); - struct list_head tmp = LIST_HEAD_INIT(tmp); - struct kvm_kernel_irq_routing_entry *e = NULL; - unsigned i; + struct kvm_irq_routing_table *new, *old; + u32 i, j, nr_rt_entries = 0; int r; for (i = 0; i < nr; ++i) { + if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) + return -EINVAL; + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); + } + + nr_rt_entries += 1; + + new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) + + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), + GFP_KERNEL); + + if (!new) + return -ENOMEM; + + new->rt_entries = (void *)&new->map[nr_rt_entries]; + + new->nr_rt_entries = nr_rt_entries; + for (i = 0; i < 3; i++) + for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++) + new->chip[i][j] = -1; + + for (i = 0; i < nr; ++i) { r = -EINVAL; - if (ue->gsi >= KVM_MAX_IRQ_ROUTES) - goto out; if (ue->flags) goto out; - r = -ENOMEM; - e = kzalloc(sizeof(*e), GFP_KERNEL); - if (!e) - goto out; - r = setup_routing_entry(e, ue); + r = setup_routing_entry(new, 
&new->rt_entries[i], ue); if (r) goto out; ++ue; - list_add(&e->link, &irq_list); - e = NULL; } - mutex_lock(&kvm->lock); - list_splice(&kvm->irq_routing, &tmp); - INIT_LIST_HEAD(&kvm->irq_routing); - list_splice(&irq_list, &kvm->irq_routing); - INIT_LIST_HEAD(&irq_list); - list_splice(&tmp, &irq_list); - mutex_unlock(&kvm->lock); + mutex_lock(&kvm->irq_lock); + old = kvm->irq_routing; + rcu_assign_pointer(kvm->irq_routing, new); + mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); + new = old; r = 0; out: - kfree(e); - __kvm_free_irq_routing(&irq_list); + kfree(new); return r; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b24e96d5d40c..a51ba60a78b1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -43,29 +43,36 @@ #include <linux/swap.h> #include <linux/bitops.h> #include <linux/spinlock.h> +#include <linux/compat.h> #include <asm/processor.h> #include <asm/io.h> #include <asm/uaccess.h> #include <asm/pgtable.h> +#include <asm-generic/bitops/le.h> #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET #include "coalesced_mmio.h" #endif -#ifdef KVM_CAP_DEVICE_ASSIGNMENT -#include <linux/pci.h> -#include <linux/interrupt.h> -#include "irq.h" -#endif +#define CREATE_TRACE_POINTS +#include <trace/events/kvm.h> MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +/* + * Ordering of locks: + * + * kvm->lock --> kvm->slots_lock --> kvm->irq_lock + */ + DEFINE_SPINLOCK(kvm_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; +static int kvm_usage_count = 0; +static atomic_t hardware_enable_failed; struct kmem_cache *kvm_vcpu_cache; EXPORT_SYMBOL_GPL(kvm_vcpu_cache); @@ -76,627 +83,12 @@ struct dentry *kvm_debugfs_dir; static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); +static int hardware_enable_all(void); +static void hardware_disable_all(void); static bool kvm_rebooting; -#ifdef KVM_CAP_DEVICE_ASSIGNMENT -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct list_head *ptr; - struct kvm_assigned_dev_kernel *match; - - list_for_each(ptr, head) { - match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) { - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - return 0; - } - - return index; -} - -static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) -{ - struct kvm_assigned_dev_kernel *assigned_dev; - struct kvm *kvm; - int irq, i; - - assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, - interrupt_work); - kvm = assigned_dev->kvm; - - /* This is taken to safely inject irq inside the guest. 
When - * the interrupt injection (or the ioapic code) uses a - * finer-grained lock, update this - */ - mutex_lock(&kvm->lock); - spin_lock_irq(&assigned_dev->assigned_dev_lock); - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - struct kvm_guest_msix_entry *guest_entries = - assigned_dev->guest_msix_entries; - for (i = 0; i < assigned_dev->entries_nr; i++) { - if (!(guest_entries[i].flags & - KVM_ASSIGNED_MSIX_PENDING)) - continue; - guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, - guest_entries[i].vector, 1); - irq = assigned_dev->host_msix_entries[i].vector; - if (irq != 0) - enable_irq(irq); - assigned_dev->host_irq_disabled = false; - } - } else { - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - if (assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_MSI) { - enable_irq(assigned_dev->host_irq); - assigned_dev->host_irq_disabled = false; - } - } - - spin_unlock_irq(&assigned_dev->assigned_dev_lock); - mutex_unlock(&assigned_dev->kvm->lock); -} - -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) -{ - unsigned long flags; - struct kvm_assigned_dev_kernel *assigned_dev = - (struct kvm_assigned_dev_kernel *) dev_id; - - spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int index = find_index_from_host_irq(assigned_dev, irq); - if (index < 0) - goto out; - assigned_dev->guest_msix_entries[index].flags |= - KVM_ASSIGNED_MSIX_PENDING; - } - - schedule_work(&assigned_dev->interrupt_work); - - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - -out: - spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev; - unsigned long flags; - - if (kian->gsi == -1) - return; - - dev = container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); - - /* The guest irq may be shared so this ack may be - * from another device. - */ - spin_lock_irqsave(&dev->assigned_dev_lock, flags); - if (dev->host_irq_disabled) { - enable_irq(dev->host_irq); - dev->host_irq_disabled = false; - } - spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); - assigned_dev->ack_notifier.gsi = -1; - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * In kvm_free_device_irq, cancel_work_sync return true if: - * 1. work is scheduled, and then cancelled. - * 2. work callback is executed. - * - * The first one ensured that the irq is disabled and no more events - * would happen. But for the second one, the irq may be enabled (e.g. - * for MSI). So we disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. 
- * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * interrupt_work done. - */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq_nosync(assigned_dev-> - host_msix_entries[i].vector); - - cancel_work_sync(&assigned_dev->interrupt_work); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - (void *)assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - disable_irq_nosync(assigned_dev->host_irq); - cancel_work_sync(&assigned_dev->interrupt_work); - - free_irq(assigned_dev->host_irq, (void *)assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); - - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - dev->host_irq = dev->dev->irq; - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. 
- */ - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, - 0, "kvm_assigned_intx_device", (void *)dev)) - return -EIO; - return 0; -} - -#ifdef __KVM_HAVE_MSI -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_msi_device", (void *)dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} -#endif - -#ifdef __KVM_HAVE_MSIX -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_intr, 0, - "kvm_assigned_msix_device", - (void *)dev); - /* FIXME: free requested_irq's on failure */ - if (r) - return r; - } - - return 0; -} - -#endif - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -#ifdef __KVM_HAVE_MSI -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} -#endif -#ifdef __KVM_HAVE_MSIX -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} -#endif - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - __u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; -#ifdef __KVM_HAVE_MSI - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; -#endif -#ifdef __KVM_HAVE_MSIX - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; -#endif - default: - r = -EINVAL; - } - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; -#ifdef __KVM_HAVE_MSI - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); - break; -#endif -#ifdef __KVM_HAVE_MSIX - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; -#endif - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else - kvm_free_irq_source_id(kvm, dev->irq_source_id); - - 
return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - r = kvm_deassign_irq(kvm, match, assigned_irq->flags); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - down_read(&kvm->slots_lock); - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_bus_and_slot(assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->assigned_dev_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match); - if (r) - goto out_list_del; - } - -out: - mutex_unlock(&kvm->lock); - up_read(&kvm->slots_lock); - return r; -out_list_del: - list_del(&match->list); - 
pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - mutex_unlock(&kvm->lock); - up_read(&kvm->slots_lock); - return r; -} -#endif - -#ifdef KVM_CAP_DEVICE_DEASSIGNMENT -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) - kvm_deassign_device(kvm, match); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} -#endif - -static inline int valid_vcpu(int n) -{ - return likely(n >= 0 && n < KVM_MAX_VCPUS); -} +static bool largepages_enabled = true; inline int kvm_is_mmio_pfn(pfn_t pfn) { @@ -742,15 +134,11 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) bool called = true; struct kvm_vcpu *vcpu; - if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) - cpumask_clear(cpus); + zalloc_cpumask_var(&cpus, GFP_ATOMIC); spin_lock(&kvm->requests_lock); - me = get_cpu(); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; + me = raw_smp_processor_id(); + kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(req, &vcpu->requests)) continue; cpu = vcpu->cpu; @@ -763,7 +151,6 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - put_cpu(); spin_unlock(&kvm->requests_lock); free_cpumask_var(cpus); return called; @@ -859,6 +246,19 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, } +static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + + spin_lock(&kvm->mmu_lock); + kvm->mmu_notifier_seq++; + kvm_set_spte_hva(kvm, address, pte); + spin_unlock(&kvm->mmu_lock); +} + static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, @@ -938,12 +338,14 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ static struct kvm *kvm_create_vm(void) { + int r = 0; struct kvm *kvm = kvm_arch_create_vm(); #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET struct page *page; @@ -951,16 +353,21 @@ static struct kvm *kvm_create_vm(void) if (IS_ERR(kvm)) goto out; + + r = hardware_enable_all(); + if (r) + goto out_err_nodisable; + #ifdef CONFIG_HAVE_KVM_IRQCHIP - INIT_LIST_HEAD(&kvm->irq_routing); INIT_HLIST_HEAD(&kvm->mask_notifier_list); + INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); #endif #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { - kfree(kvm); - return ERR_PTR(-ENOMEM); + r = -ENOMEM; + goto out_err; } kvm->coalesced_mmio_ring = (struct kvm_coalesced_mmio_ring *)page_address(page); @@ -968,15 +375,13 @@ static struct kvm *kvm_create_vm(void) #if 
defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) { - int err; kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; - err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); - if (err) { + r = mmu_notifier_register(&kvm->mmu_notifier, current->mm); + if (r) { #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET put_page(page); #endif - kfree(kvm); - return ERR_PTR(err); + goto out_err; } } #endif @@ -986,7 +391,9 @@ static struct kvm *kvm_create_vm(void) spin_lock_init(&kvm->mmu_lock); spin_lock_init(&kvm->requests_lock); kvm_io_bus_init(&kvm->pio_bus); + kvm_eventfd_init(kvm); mutex_init(&kvm->lock); + mutex_init(&kvm->irq_lock); kvm_io_bus_init(&kvm->mmio_bus); init_rwsem(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); @@ -998,6 +405,15 @@ static struct kvm *kvm_create_vm(void) #endif out: return kvm; + +#if defined(KVM_COALESCED_MMIO_PAGE_OFFSET) || \ + (defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)) +out_err: + hardware_disable_all(); +#endif +out_err_nodisable: + kfree(kvm); + return ERR_PTR(r); } /* @@ -1006,19 +422,25 @@ out: static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + int i; + if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) vfree(free->dirty_bitmap); - if (!dont || free->lpage_info != dont->lpage_info) - vfree(free->lpage_info); + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { + vfree(free->lpage_info[i]); + free->lpage_info[i] = NULL; + } + } free->npages = 0; free->dirty_bitmap = NULL; free->rmap = NULL; - free->lpage_info = NULL; } void kvm_free_physmem(struct kvm *kvm) @@ -1050,6 +472,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_flush_shadow(kvm); #endif kvm_arch_destroy_vm(kvm); + hardware_disable_all(); mmdrop(mm); } @@ -1071,6 +494,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) { struct kvm *kvm = filp->private_data; + kvm_irqfd_release(kvm); + kvm_put_kvm(kvm); return 0; } @@ -1089,8 +514,8 @@ int __kvm_set_memory_region(struct kvm *kvm, { int r; gfn_t base_gfn; - unsigned long npages, ugfn; - unsigned long largepages, i; + unsigned long npages; + unsigned long i; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; @@ -1164,31 +589,51 @@ int __kvm_set_memory_region(struct kvm *kvm, else new.userspace_addr = 0; } - if (npages && !new.lpage_info) { - largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; - largepages -= base_gfn / KVM_PAGES_PER_HPAGE; + if (!npages) + goto skip_lpage; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + unsigned long j; + int lpages; + int level = i + 2; - new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); + /* Avoid unused variable warning if no large pages */ + (void)level; - if (!new.lpage_info) + if (new.lpage_info[i]) + continue; + + lpages = 1 + (base_gfn + npages - 1) / + KVM_PAGES_PER_HPAGE(level); + lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); + + new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); + + if (!new.lpage_info[i]) goto out_free; - memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); + memset(new.lpage_info[i], 0, + lpages * sizeof(*new.lpage_info[i])); - if (base_gfn % KVM_PAGES_PER_HPAGE) - new.lpage_info[0].write_count = 1; - if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) - new.lpage_info[largepages-1].write_count = 1; + if (base_gfn % KVM_PAGES_PER_HPAGE(level)) + 
new.lpage_info[i][0].write_count = 1; + if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][lpages - 1].write_count = 1; ugfn = new.userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each - * other, disable large page support for this slot + * other, or if explicitly asked to, disable large page + * support for this slot */ - if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) - for (i = 0; i < largepages; ++i) - new.lpage_info[i].write_count = 1; + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !largepages_enabled) + for (j = 0; j < lpages; ++j) + new.lpage_info[i][j].write_count = 1; } +skip_lpage: + /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; @@ -1200,6 +645,10 @@ int __kvm_set_memory_region(struct kvm *kvm, if (old.npages) kvm_arch_flush_shadow(kvm); } +#else /* not defined CONFIG_S390 */ + new.user_alloc = user_alloc; + if (user_alloc) + new.userspace_addr = mem->userspace_addr; #endif /* not defined CONFIG_S390 */ if (!npages) @@ -1299,6 +748,12 @@ out: return r; } +void kvm_disable_largepages(void) +{ + largepages_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_largepages); + int is_error_page(struct page *page) { return page == bad_page; @@ -1620,8 +1075,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) unsigned long rel_gfn = gfn - memslot->base_gfn; /* avoid RMW */ - if (!test_bit(rel_gfn, memslot->dirty_bitmap)) - set_bit(rel_gfn, memslot->dirty_bitmap); + if (!generic_test_le_bit(rel_gfn, memslot->dirty_bitmap)) + generic___set_le_bit(rel_gfn, memslot->dirty_bitmap); } } @@ -1635,9 +1090,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if ((kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)) || - kvm_arch_vcpu_runnable(vcpu)) { + if (kvm_arch_vcpu_runnable(vcpu)) { set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; } @@ -1646,9 +1099,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) if (signal_pending(current)) break; - vcpu_put(vcpu); schedule(); - vcpu_load(vcpu); } finish_wait(&vcpu->wq, &wait); @@ -1662,6 +1113,21 @@ void kvm_resched(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_resched); +void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) +{ + ktime_t expires; + DEFINE_WAIT(wait); + + prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + + /* Sleep for 100 us, and hope lock-holder got scheduled */ + expires = ktime_add_ns(ktime_get(), 100000UL); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + + finish_wait(&vcpu->wq, &wait); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); + static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct kvm_vcpu *vcpu = vma->vm_file->private_data; @@ -1684,7 +1150,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -static struct vm_operations_struct kvm_vcpu_vm_ops = { +static const struct vm_operations_struct kvm_vcpu_vm_ops = { .fault = kvm_vcpu_fault, }; @@ -1714,24 +1180,18 @@ static struct file_operations kvm_vcpu_fops = { */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); - if (fd < 0) - kvm_put_kvm(vcpu->kvm); - return fd; + return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); } /* * Creates some virtual cpus. Good luck creating more than one. 
*/ -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) { int r; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *v; - if (!valid_vcpu(n)) - return -EINVAL; - - vcpu = kvm_arch_vcpu_create(kvm, n); + vcpu = kvm_arch_vcpu_create(kvm, id); if (IS_ERR(vcpu)) return PTR_ERR(vcpu); @@ -1742,23 +1202,38 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) return r; mutex_lock(&kvm->lock); - if (kvm->vcpus[n]) { - r = -EEXIST; + if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { + r = -EINVAL; goto vcpu_destroy; } - kvm->vcpus[n] = vcpu; - mutex_unlock(&kvm->lock); + + kvm_for_each_vcpu(r, v, kvm) + if (v->vcpu_id == id) { + r = -EEXIST; + goto vcpu_destroy; + } + + BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); - if (r < 0) - goto unlink; + if (r < 0) { + kvm_put_kvm(kvm); + goto vcpu_destroy; + } + + kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; + smp_wmb(); + atomic_inc(&kvm->online_vcpus); + +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + if (kvm->bsp_vcpu_id == id) + kvm->bsp_vcpu = vcpu; +#endif + mutex_unlock(&kvm->lock); return r; -unlink: - mutex_lock(&kvm->lock); - kvm->vcpus[n] = NULL; vcpu_destroy: mutex_unlock(&kvm->lock); kvm_arch_vcpu_destroy(vcpu); @@ -1776,88 +1251,6 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) return 0; } -#ifdef __KVM_HAVE_MSIX -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = kzalloc( - sizeof(struct kvm_guest_msix_entry) * - entry_nr->entry_nr, GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} -#endif - static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2116,118 +1509,89 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif -#ifdef KVM_CAP_DEVICE_ASSIGNMENT - case 
KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; + case KVM_IRQFD: { + struct kvm_irqfd data; r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) + if (copy_from_user(&data, argp, sizeof data)) goto out; + r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); break; } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } -#ifdef KVM_CAP_ASSIGN_DEV_IRQ - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; + case KVM_IOEVENTFD: { + struct kvm_ioeventfd data; r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) + if (copy_from_user(&data, argp, sizeof data)) goto out; + r = kvm_ioeventfd(kvm, &data); break; } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_SET_BOOT_CPU_ID: + r = 0; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); break; - } -#endif #endif -#ifdef KVM_CAP_DEVICE_DEASSIGNMENT - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; + default: + r = kvm_arch_vm_ioctl(filp, ioctl, arg); + if (r == -ENOTTY) + r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); } -#endif -#ifdef KVM_CAP_IRQ_ROUTING - case KVM_SET_GSI_ROUTING: { - struct kvm_irq_routing routing; - struct kvm_irq_routing __user *urouting; - struct kvm_irq_routing_entry *entries; +out: + return r; +} + +#ifdef CONFIG_COMPAT +struct compat_kvm_dirty_log { + __u32 slot; + __u32 padding1; + union { + compat_uptr_t dirty_bitmap; /* one bit per page */ + __u64 padding2; + }; +}; + +static long kvm_vm_compat_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + int r; + + if (kvm->mm != current->mm) + return -EIO; + switch (ioctl) { + case KVM_GET_DIRTY_LOG: { + struct compat_kvm_dirty_log compat_log; + struct kvm_dirty_log log; r = -EFAULT; - if (copy_from_user(&routing, argp, sizeof(routing))) - goto out; - r = -EINVAL; - if (routing.nr >= KVM_MAX_IRQ_ROUTES) - goto out; - if (routing.flags) - goto out; - r = -ENOMEM; - entries = vmalloc(routing.nr * sizeof(*entries)); - if (!entries) - goto out; - r = -EFAULT; - urouting = argp; - if (copy_from_user(entries, urouting->entries, - routing.nr * sizeof(*entries))) - goto out_free_irq_routing; - r = kvm_set_irq_routing(kvm, entries, routing.nr, - routing.flags); - out_free_irq_routing: - vfree(entries); - break; - } -#ifdef __KVM_HAVE_MSIX - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) + if (copy_from_user(&compat_log, (void __user *)arg, + sizeof(compat_log))) goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); + log.slot = 
compat_log.slot; + log.padding1 = compat_log.padding1; + log.padding2 = compat_log.padding2; + log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); + + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); if (r) goto out; break; } -#endif -#endif /* KVM_CAP_IRQ_ROUTING */ default: - r = kvm_arch_vm_ioctl(filp, ioctl, arg); + r = kvm_vm_ioctl(filp, ioctl, arg); } + out: return r; } +#endif static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -2250,7 +1614,7 @@ static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -static struct vm_operations_struct kvm_vm_vm_ops = { +static const struct vm_operations_struct kvm_vm_vm_ops = { .fault = kvm_vm_fault, }; @@ -2263,7 +1627,9 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, - .compat_ioctl = kvm_vm_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = kvm_vm_compat_ioctl, +#endif .mmap = kvm_vm_mmap, }; @@ -2275,7 +1641,7 @@ static int kvm_dev_ioctl_create_vm(void) kvm = kvm_create_vm(); if (IS_ERR(kvm)) return PTR_ERR(kvm); - fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0); + fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (fd < 0) kvm_put_kvm(kvm); @@ -2288,6 +1654,10 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_USER_MEMORY: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: +#ifdef CONFIG_KVM_APIC_ARCHITECTURE + case KVM_CAP_SET_BOOT_CPU_ID: +#endif + case KVM_CAP_INTERNAL_ERROR_DATA: return 1; #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQ_ROUTING: @@ -2335,7 +1705,7 @@ static long kvm_dev_ioctl(struct file *filp, case KVM_TRACE_ENABLE: case KVM_TRACE_PAUSE: case KVM_TRACE_DISABLE: - r = kvm_trace_ioctl(ioctl, arg); + r = -EOPNOTSUPP; break; default: return kvm_arch_dev_ioctl(filp, ioctl, arg); @@ -2358,11 +1728,21 @@ static struct miscdevice kvm_dev = { static void hardware_enable(void *junk) { int cpu = raw_smp_processor_id(); + int r; if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) return; + cpumask_set_cpu(cpu, cpus_hardware_enabled); - kvm_arch_hardware_enable(NULL); + + r = kvm_arch_hardware_enable(NULL); + + if (r) { + cpumask_clear_cpu(cpu, cpus_hardware_enabled); + atomic_inc(&hardware_enable_failed); + printk(KERN_INFO "kvm: enabling virtualization on " + "CPU%d failed\n", cpu); + } } static void hardware_disable(void *junk) @@ -2375,11 +1755,52 @@ static void hardware_disable(void *junk) kvm_arch_hardware_disable(NULL); } +static void hardware_disable_all_nolock(void) +{ + BUG_ON(!kvm_usage_count); + + kvm_usage_count--; + if (!kvm_usage_count) + on_each_cpu(hardware_disable, NULL, 1); +} + +static void hardware_disable_all(void) +{ + spin_lock(&kvm_lock); + hardware_disable_all_nolock(); + spin_unlock(&kvm_lock); +} + +static int hardware_enable_all(void) +{ + int r = 0; + + spin_lock(&kvm_lock); + + kvm_usage_count++; + if (kvm_usage_count == 1) { + atomic_set(&hardware_enable_failed, 0); + on_each_cpu(hardware_enable, NULL, 1); + + if (atomic_read(&hardware_enable_failed)) { + hardware_disable_all_nolock(); + r = -EBUSY; + } + } + + spin_unlock(&kvm_lock); + + return r; +} + static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, void *v) { int cpu = (long)v; + if (!kvm_usage_count) + return NOTIFY_OK; + val &= ~CPU_TASKS_FROZEN; switch (val) { case CPU_DYING: @@ -2449,26 +1870,71 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus) } } -struct 
kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, - gpa_t addr, int len, int is_write) +/* kvm_io_bus_write - called under kvm->slots_lock */ +int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, + int len, const void *val) { int i; + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} - for (i = 0; i < bus->dev_count; i++) { - struct kvm_io_device *pos = bus->devs[i]; +/* kvm_io_bus_read - called under kvm->slots_lock */ +int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val) +{ + int i; + for (i = 0; i < bus->dev_count; i++) + if (!kvm_iodevice_read(bus->devs[i], addr, len, val)) + return 0; + return -EOPNOTSUPP; +} - if (pos->in_range(pos, addr, len, is_write)) - return pos; - } +int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + int ret; - return NULL; + down_write(&kvm->slots_lock); + ret = __kvm_io_bus_register_dev(bus, dev); + up_write(&kvm->slots_lock); + + return ret; } -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) +/* An unlocked version. Caller must have write lock on slots_lock. */ +int __kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev) { - BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); + if (bus->dev_count > NR_IOBUS_DEVS-1) + return -ENOSPC; bus->devs[bus->dev_count++] = dev; + + return 0; +} + +void kvm_io_bus_unregister_dev(struct kvm *kvm, + struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + down_write(&kvm->slots_lock); + __kvm_io_bus_unregister_dev(bus, dev); + up_write(&kvm->slots_lock); +} + +/* An unlocked version. Caller must have write lock on slots_lock. */ +void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) + if (bus->devs[i] == dev) { + bus->devs[i] = bus->devs[--bus->dev_count]; + break; + } } static struct notifier_block kvm_cpu_notifier = { @@ -2501,18 +1967,16 @@ static int vcpu_stat_get(void *_offset, u64 *val) *val = 0; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (vcpu) - *val += *(u32 *)((void *)vcpu + offset); - } + kvm_for_each_vcpu(i, vcpu, kvm) + *val += *(u32 *)((void *)vcpu + offset); + spin_unlock(&kvm_lock); return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); -static struct file_operations *stat_fops[] = { +static const struct file_operations *stat_fops[] = { [KVM_STAT_VCPU] = &vcpu_stat_fops, [KVM_STAT_VM] = &vm_stat_fops, }; @@ -2539,13 +2003,15 @@ static void kvm_exit_debug(void) static int kvm_suspend(struct sys_device *dev, pm_message_t state) { - hardware_disable(NULL); + if (kvm_usage_count) + hardware_disable(NULL); return 0; } static int kvm_resume(struct sys_device *dev) { - hardware_enable(NULL); + if (kvm_usage_count) + hardware_enable(NULL); return 0; } @@ -2590,8 +2056,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size, int r; int cpu; - kvm_init_debug(); - r = kvm_arch_init(opaque); if (r) goto out_fail; @@ -2622,7 +2086,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size, goto out_free_1; } - on_each_cpu(hardware_enable, NULL, 1); r = register_cpu_notifier(&kvm_cpu_notifier); if (r) goto out_free_2; @@ -2658,6 +2121,8 @@ int kvm_init(void *opaque, unsigned int vcpu_size, kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; + kvm_init_debug(); + return 0; 
out_free: @@ -2670,7 +2135,6 @@ out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); out_free_2: - on_each_cpu(hardware_disable, NULL, 1); out_free_1: kvm_arch_hardware_unsetup(); out_free_0a: @@ -2679,7 +2143,6 @@ out_free_0: __free_page(bad_page); out: kvm_arch_exit(); - kvm_exit_debug(); out_fail: return r; } @@ -2687,7 +2150,8 @@ EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { - kvm_trace_cleanup(); + tracepoint_synchronize_unregister(); + kvm_exit_debug(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); sysdev_unregister(&kvm_sysdev); @@ -2697,7 +2161,6 @@ void kvm_exit(void) on_each_cpu(hardware_disable, NULL, 1); kvm_arch_hardware_unsetup(); kvm_arch_exit(); - kvm_exit_debug(); free_cpumask_var(cpus_hardware_enabled); __free_page(bad_page); } diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c deleted file mode 100644 index f59874446440..000000000000 --- a/virt/kvm/kvm_trace.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * kvm trace - * - * It is designed to allow debugging traces of kvm to be generated - * on UP / SMP machines. Each trace entry can be timestamped so that - * it's possible to reconstruct a chronological record of trace events. - * The implementation refers to blktrace kernel support. - * - * Copyright (c) 2008 Intel Corporation - * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> - * - * Authors: Feng(Eric) Liu, eric.e.liu@intel.com - * - * Date: Feb 2008 - */ - -#include <linux/module.h> -#include <linux/relay.h> -#include <linux/debugfs.h> -#include <linux/ktime.h> - -#include <linux/kvm_host.h> - -#define KVM_TRACE_STATE_RUNNING (1 << 0) -#define KVM_TRACE_STATE_PAUSE (1 << 1) -#define KVM_TRACE_STATE_CLEARUP (1 << 2) - -struct kvm_trace { - int trace_state; - struct rchan *rchan; - struct dentry *lost_file; - atomic_t lost_records; -}; -static struct kvm_trace *kvm_trace; - -struct kvm_trace_probe { - const char *name; - const char *format; - u32 timestamp_in; - marker_probe_func *probe_func; -}; - -static inline int calc_rec_size(int timestamp, int extra) -{ - int rec_size = KVM_TRC_HEAD_SIZE; - - rec_size += extra; - return timestamp ? 
rec_size += KVM_TRC_CYCLE_SIZE : rec_size; -} - -static void kvm_add_trace(void *probe_private, void *call_data, - const char *format, va_list *args) -{ - struct kvm_trace_probe *p = probe_private; - struct kvm_trace *kt = kvm_trace; - struct kvm_trace_rec rec; - struct kvm_vcpu *vcpu; - int i, size; - u32 extra; - - if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING)) - return; - - rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32)); - vcpu = va_arg(*args, struct kvm_vcpu *); - rec.pid = current->tgid; - rec.vcpu_id = vcpu->vcpu_id; - - extra = va_arg(*args, u32); - WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX)); - extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX); - - rec.rec_val |= TRACE_REC_TCS(p->timestamp_in) - | TRACE_REC_NUM_DATA_ARGS(extra); - - if (p->timestamp_in) { - rec.u.timestamp.timestamp = ktime_to_ns(ktime_get()); - - for (i = 0; i < extra; i++) - rec.u.timestamp.extra_u32[i] = va_arg(*args, u32); - } else { - for (i = 0; i < extra; i++) - rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32); - } - - size = calc_rec_size(p->timestamp_in, extra * sizeof(u32)); - relay_write(kt->rchan, &rec, size); -} - -static struct kvm_trace_probe kvm_trace_probes[] = { - { "kvm_trace_entryexit", "%u %p %u %u %u %u %u %u", 1, kvm_add_trace }, - { "kvm_trace_handler", "%u %p %u %u %u %u %u %u", 0, kvm_add_trace }, -}; - -static int lost_records_get(void *data, u64 *val) -{ - struct kvm_trace *kt = data; - - *val = atomic_read(&kt->lost_records); - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n"); - -/* - * The relay channel is used in "no-overwrite" mode, it keeps trace of how - * many times we encountered a full subbuffer, to tell user space app the - * lost records there were. - */ -static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct kvm_trace *kt; - - if (!relay_buf_full(buf)) { - if (!prev_subbuf) { - /* - * executed only once when the channel is opened - * save metadata as first record - */ - subbuf_start_reserve(buf, sizeof(u32)); - *(u32 *)subbuf = 0x12345678; - } - - return 1; - } - - kt = buf->chan->private_data; - atomic_inc(&kt->lost_records); - - return 0; -} - -static struct dentry *kvm_create_buf_file_callack(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static int kvm_remove_buf_file_callback(struct dentry *dentry) -{ - debugfs_remove(dentry); - return 0; -} - -static struct rchan_callbacks kvm_relay_callbacks = { - .subbuf_start = kvm_subbuf_start_callback, - .create_buf_file = kvm_create_buf_file_callack, - .remove_buf_file = kvm_remove_buf_file_callback, -}; - -static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts) -{ - struct kvm_trace *kt; - int i, r = -ENOMEM; - - if (!kuts->buf_size || !kuts->buf_nr) - return -EINVAL; - - kt = kzalloc(sizeof(*kt), GFP_KERNEL); - if (!kt) - goto err; - - r = -EIO; - atomic_set(&kt->lost_records, 0); - kt->lost_file = debugfs_create_file("lost_records", 0444, kvm_debugfs_dir, - kt, &kvm_trace_lost_ops); - if (!kt->lost_file) - goto err; - - kt->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size, - kuts->buf_nr, &kvm_relay_callbacks, kt); - if (!kt->rchan) - goto err; - - kvm_trace = kt; - - for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { - struct kvm_trace_probe *p = &kvm_trace_probes[i]; - - r = marker_probe_register(p->name, p->format, p->probe_func, 
p); - if (r) - printk(KERN_INFO "Unable to register probe %s\n", - p->name); - } - - kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING; - - return 0; err: - if (kt) { - if (kt->lost_file) - debugfs_remove(kt->lost_file); - if (kt->rchan) - relay_close(kt->rchan); - kfree(kt); - } - return r; -} - -static int kvm_trace_enable(char __user *arg) -{ - struct kvm_user_trace_setup kuts; - int ret; - - ret = copy_from_user(&kuts, arg, sizeof(kuts)); - if (ret) - return -EFAULT; - - ret = do_kvm_trace_enable(&kuts); - if (ret) - return ret; - - return 0; -} - -static int kvm_trace_pause(void) -{ - struct kvm_trace *kt = kvm_trace; - int r = -EINVAL; - - if (kt == NULL) - return r; - - if (kt->trace_state == KVM_TRACE_STATE_RUNNING) { - kt->trace_state = KVM_TRACE_STATE_PAUSE; - relay_flush(kt->rchan); - r = 0; - } - - return r; -} - -void kvm_trace_cleanup(void) -{ - struct kvm_trace *kt = kvm_trace; - int i; - - if (kt == NULL) - return; - - if (kt->trace_state == KVM_TRACE_STATE_RUNNING || - kt->trace_state == KVM_TRACE_STATE_PAUSE) { - - kt->trace_state = KVM_TRACE_STATE_CLEARUP; - - for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) { - struct kvm_trace_probe *p = &kvm_trace_probes[i]; - marker_probe_unregister(p->name, p->probe_func, p); - } - marker_synchronize_unregister(); - - relay_close(kt->rchan); - debugfs_remove(kt->lost_file); - kfree(kt); - } -} - -int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - long r = -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (ioctl) { - case KVM_TRACE_ENABLE: - r = kvm_trace_enable(argp); - break; - case KVM_TRACE_PAUSE: - r = kvm_trace_pause(); - break; - case KVM_TRACE_DISABLE: - r = 0; - kvm_trace_cleanup(); - break; - } - - return r; -}
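The hunks above lean on a handful of recurring patterns; the sketches below show each in isolation. Every identifier that does not appear in the diff is hypothetical, and the userspace fragments assume a Linux host with the KVM ABI of this kernel generation.

kvm_set_msi() in the irq_comm.c hunk extracts the destination APIC ID and vector directly from the MSI address/data words. A runnable decoder for that x86 layout, with the mask and shift values the patch uses:

#include <stdio.h>
#include <stdint.h>

#define MSI_ADDR_DEST_ID_MASK	0x000ff000u
#define MSI_ADDR_DEST_ID_SHIFT	12
#define MSI_DATA_VECTOR_MASK	0x000000ffu

int main(void)
{
	uint32_t address_lo = 0xfee01000u;	/* dest id 0x01 */
	uint32_t data       = 0x00000031u;	/* vector 0x31, fixed, edge */

	unsigned dest_id = (address_lo & MSI_ADDR_DEST_ID_MASK)
				>> MSI_ADDR_DEST_ID_SHIFT;
	unsigned vector  = data & MSI_DATA_VECTOR_MASK;

	printf("dest_id=%#x vector=%#x\n", dest_id, vector);
	return 0;	/* prints dest_id=0x1 vector=0x31 */
}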
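kvm_request_irq_source_id() hands out at most BITS_PER_LONG source IDs from a one-word bitmap with find_first_zero_bit(), now under kvm->irq_lock rather than an implicit caller-held lock. A runnable userspace analogue of that first-fit allocator:

#include <stdio.h>
#include <limits.h>

#define NBITS (sizeof(unsigned long) * CHAR_BIT)

static unsigned long bitmap;	/* bit n set => id n in use */

static int request_id(void)
{
	unsigned i;

	for (i = 0; i < NBITS; i++)	/* find_first_zero_bit() */
		if (!(bitmap & (1UL << i))) {
			bitmap |= 1UL << i;
			return i;
		}
	return -1;	/* kernel: "exhaust allocatable IRQ sources!" */
}

static void free_id(int id)
{
	if (id >= 0 && id < (int)NBITS)
		bitmap &= ~(1UL << id);
}

int main(void)
{
	int a = request_id();	/* 0 */
	int b = request_id();	/* 1 */
	free_id(a);
	printf("%d %d %d\n", a, b, request_id());	/* 0 1 0 */
	return 0;
}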
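kvm_set_irq_routing() now builds a complete replacement table, publishes it with rcu_assign_pointer(), and only kfree()s the old table after synchronize_rcu(); kvm_set_irq() correspondingly snapshots the matching entries under rcu_read_lock() and invokes the ->set callbacks outside the read-side critical section. The skeleton of that swap in kernel style (the "table" type here is hypothetical):

static struct table *cur_table;		/* RCU-protected; updates under a mutex */
static DEFINE_MUTEX(update_lock);

static void table_replace(struct table *new)
{
	struct table *old;

	mutex_lock(&update_lock);
	old = cur_table;
	rcu_assign_pointer(cur_table, new);	/* publish the new table */
	mutex_unlock(&update_lock);

	synchronize_rcu();		/* wait out readers of 'old' */
	kfree(old);
}

static unsigned table_lookup(void)
{
	struct table *t;
	unsigned nr;

	rcu_read_lock();
	t = rcu_dereference(cur_table);
	nr = t ? t->nr : 0;		/* copy out what we need, then drop RCU */
	rcu_read_unlock();
	return nr;
}

Copying the entries out before calling ->set is what lets the callbacks sleep even though the lookup itself is lockless.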
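That replacement table is carved out of a single kzalloc(): the struct header, then nr_rt_entries hash-list heads, then the entry array, with new->rt_entries pointed just past the heads. A runnable userspace analogue of the one-block layout:

#include <stdio.h>
#include <stdlib.h>

struct entry { struct entry *next; unsigned gsi; };
struct head  { struct entry *first; };

struct table {
	unsigned nr_heads;
	struct entry *entries;	/* points into the same block */
	struct head map[];	/* flexible array member */
};

static struct table *table_alloc(unsigned nr_heads, unsigned nr_entries)
{
	struct table *t = calloc(1, sizeof(*t)
				 + nr_heads * sizeof(struct head)
				 + nr_entries * sizeof(struct entry));
	if (!t)
		return NULL;
	t->nr_heads = nr_heads;
	t->entries = (void *)&t->map[nr_heads];	/* storage after the heads */
	return t;
}

int main(void)
{
	struct table *t = table_alloc(24, 8);
	if (!t)
		return 1;
	printf("one block: heads at %p, entries at %p\n",
	       (void *)t->map, (void *)t->entries);
	free(t);	/* one free releases everything */
	return 0;
}

One allocation means one failure path and one free, which is exactly what lets kvm_free_irq_routing() collapse to a single kfree().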
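kvm_vm_ioctl_create_vcpu() publishes a new vcpu by filling the kvm->vcpus[] slot first and incrementing online_vcpus only after an smp_wmb(), so a lockless kvm_for_each_vcpu() walker that bounds itself by the count never observes an empty slot (kvm_get_vcpu() pairs this with a read barrier). The two-store pattern, shown as a kernel-style fragment rather than a standalone program:

	/* writer side, executed under kvm->lock: */
	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
	smp_wmb();			/* fill the slot before raising the count */
	atomic_inc(&kvm->online_vcpus);

	/* lockless reader side, roughly what kvm_for_each_vcpu() expands to: */
	for (idx = 0; idx < atomic_read(&kvm->online_vcpus); idx++) {
		vcpu = kvm_get_vcpu(kvm, idx);	/* contains the pairing read barrier */
		/* ... use vcpu; the slot is guaranteed populated ... */
	}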
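KVM_IRQFD, added to kvm_vm_ioctl() above, attaches an eventfd to a GSI so that a plain write() injects a guest interrupt with no further vm ioctl. A sketch of the userspace side, assuming an existing vmfd on which KVM_CREATE_IRQCHIP has already been issued:

#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int wire_irqfd(int vmfd, int gsi)
{
	struct kvm_irqfd req;
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.fd  = efd;
	req.gsi = gsi;		/* flags = 0: assign */
	if (ioctl(vmfd, KVM_IRQFD, &req) < 0)
		return -1;

	/* From here, any write raises the interrupt on 'gsi':
	 *	uint64_t one = 1;
	 *	write(efd, &one, sizeof(one));
	 */
	return efd;
}

KVM_IOEVENTFD is the mirror image: the guest's write to a registered PIO/MMIO address signals the eventfd instead of exiting to userspace.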
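The new compat path exists because struct kvm_dirty_log embeds a user pointer, whose size differs when a 32-bit process runs on a 64-bit kernel; compat_ptr() widens the bitmap pointer before reusing the native handler. For a native caller the ioctl looks like this (a sketch; it assumes slot 0 was registered with KVM_MEM_LOG_DIRTY_PAGES and spans npages pages):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void *get_dirty_bitmap(int vmfd, unsigned long npages)
{
	struct kvm_dirty_log log;
	size_t bytes = ((npages + 63) / 64) * 8;	/* one bit per page */
	void *bitmap = calloc(1, bytes);

	if (!bitmap)
		return NULL;

	memset(&log, 0, sizeof(log));
	log.slot = 0;
	log.dirty_bitmap = bitmap;
	if (ioctl(vmfd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(bitmap);
		return NULL;
	}
	return bitmap;	/* little-endian bit order, per the generic
			 * *_le_bit helpers mark_page_dirty() now uses */
}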
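kvm_io_bus_write()/kvm_io_bus_read() replace the old find-then-dispatch pair with a probe loop: offer the access to each registered device until one returns 0, else fail with -EOPNOTSUPP; registration is bounds-checked and unregistration swaps in the last element rather than shifting the array. A runnable userspace analogue:

#include <stdio.h>

#define NR_DEVS 4

struct dev {
	unsigned long base, len;
	int (*write)(struct dev *d, unsigned long addr, int len,
		     const void *val);
};

struct bus {
	int dev_count;
	struct dev *devs[NR_DEVS];
};

static int bus_register(struct bus *bus, struct dev *d)
{
	if (bus->dev_count > NR_DEVS - 1)
		return -1;			/* kernel: -ENOSPC */
	bus->devs[bus->dev_count++] = d;
	return 0;
}

static void bus_unregister(struct bus *bus, struct dev *d)
{
	int i;

	for (i = 0; i < bus->dev_count; i++)
		if (bus->devs[i] == d) {
			/* swap in the last element, as the patch does */
			bus->devs[i] = bus->devs[--bus->dev_count];
			break;
		}
}

static int bus_write(struct bus *bus, unsigned long addr, int len,
		     const void *val)
{
	int i;

	for (i = 0; i < bus->dev_count; i++)
		if (!bus->devs[i]->write(bus->devs[i], addr, len, val))
			return 0;	/* first device to claim it wins */
	return -1;			/* kernel: -EOPNOTSUPP */
}

static int uart_write(struct dev *d, unsigned long addr, int len,
		      const void *val)
{
	if (addr < d->base || addr + len > d->base + d->len)
		return -1;		/* not ours; keep scanning */
	printf("uart got '%c'\n", *(const char *)val);
	return 0;
}

int main(void)
{
	struct dev uart = { .base = 0x3f8, .len = 8, .write = uart_write };
	struct bus bus = { 0 };

	bus_register(&bus, &uart);
	bus_write(&bus, 0x3f8, 1, "A");		/* the uart claims it */
	bus_write(&bus, 0x500, 1, "B");		/* nobody does: returns -1 */
	bus_unregister(&bus, &uart);
	return 0;
}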
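Finally, hardware_enable_all()/hardware_disable_all() turn VMX/SVM on only while at least one VM exists: a usage count under kvm_lock flips the hardware at the 0->1 and 1->0 edges, and rolls the count back if any CPU fails to enable. The same pattern as a runnable userspace program:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int usage_count;

static int  resource_on(void)  { puts("on");  return 0; /* may fail */ }
static void resource_off(void) { puts("off"); }

static void disable_all_nolock(void)
{
	if (--usage_count == 0)
		resource_off();
}

static int enable_all(void)
{
	int r = 0;

	pthread_mutex_lock(&lock);
	if (++usage_count == 1) {
		r = resource_on();
		if (r)
			disable_all_nolock();	/* roll the count back */
	}
	pthread_mutex_unlock(&lock);
	return r;
}

static void disable_all(void)
{
	pthread_mutex_lock(&lock);
	disable_all_nolock();
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	enable_all();	/* first user powers on */
	enable_all();	/* second user: no-op   */
	disable_all();
	disable_all();	/* last user powers off */
	return 0;
}

The suspend/resume and CPU-hotplug callbacks in the patch check kvm_usage_count for the same reason: with no VMs, there is nothing to re-enable.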