summaryrefslogtreecommitdiff
path: root/drivers/iommu
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-29 10:00:14 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-29 10:00:14 -0700
commit4d2af08ed08ce87c4fd2379fa857153631ee8537 (patch)
tree6e2c3891290440fb902145fa7a11837f7d17f694 /drivers/iommu
parent02dc96ef6c25f990452c114c59d75c368a1f4c8f (diff)
parent2a78f9962565e53b78363eaf516eb052009e8020 (diff)
downloadlwn-4d2af08ed08ce87c4fd2379fa857153631ee8537.tar.gz
lwn-4d2af08ed08ce87c4fd2379fa857153631ee8537.zip
Merge tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu
Pull iommu fixes from Joerg Roedel: "A couple of fixes for the AMD IOMMU driver have piled up: - Some fixes for the reworked IO page-table which caused memory leaks or did not allow to downgrade mappings under some conditions. - Locking fixes to fix a couple of possible races around accessing 'struct protection_domain'. The races got introduced when the dma-ops path became lock-less in the fast-path" * tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu: iommu/amd: Lock code paths traversing protection_domain->dev_list iommu/amd: Lock dev_data in attach/detach code paths iommu/amd: Check for busy devices earlier in attach_device() iommu/amd: Take domain->lock for complete attach/detach path iommu/amd: Remove amd_iommu_devtable_lock iommu/amd: Remove domain->updated iommu/amd: Wait for completion of IOTLB flush in attach_device iommu/amd: Unmap all L7 PTEs when downgrading page-sizes iommu/amd: Introduce first_pte_l7() helper iommu/amd: Fix downgrading default page-sizes in alloc_pte() iommu/amd: Fix pages leak in free_pagetable()
Diffstat (limited to 'drivers/iommu')
-rw-r--r--drivers/iommu/amd_iommu.c229
-rw-r--r--drivers/iommu/amd_iommu_types.h4
2 files changed, 139 insertions, 94 deletions
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 97975bb7f347..2369b8af81f3 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -70,7 +70,6 @@
*/
#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38))
-static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
static DEFINE_SPINLOCK(pd_bitmap_lock);
/* List of all available dev_data structures */
@@ -202,6 +201,7 @@ static struct iommu_dev_data *alloc_dev_data(u16 devid)
if (!dev_data)
return NULL;
+ spin_lock_init(&dev_data->lock);
dev_data->devid = devid;
ratelimit_default_init(&dev_data->rs);
@@ -501,6 +501,29 @@ static void iommu_uninit_device(struct device *dev)
*/
}
+/*
+ * Helper function to get the first pte of a large mapping
+ */
+static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
+ unsigned long *count)
+{
+ unsigned long pte_mask, pg_size, cnt;
+ u64 *fpte;
+
+ pg_size = PTE_PAGE_SIZE(*pte);
+ cnt = PAGE_SIZE_PTE_COUNT(pg_size);
+ pte_mask = ~((cnt << 3) - 1);
+ fpte = (u64 *)(((unsigned long)pte) & pte_mask);
+
+ if (page_size)
+ *page_size = pg_size;
+
+ if (count)
+ *count = cnt;
+
+ return fpte;
+}
+
/****************************************************************************
*
* Interrupt handling functions
@@ -1311,8 +1334,12 @@ static void domain_flush_np_cache(struct protection_domain *domain,
dma_addr_t iova, size_t size)
{
if (unlikely(amd_iommu_np_cache)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->lock, flags);
domain_flush_pages(domain, iova, size);
domain_flush_complete(domain);
+ spin_unlock_irqrestore(&domain->lock, flags);
}
}
@@ -1425,7 +1452,7 @@ static void free_pagetable(struct protection_domain *domain)
BUG_ON(domain->mode < PAGE_MODE_NONE ||
domain->mode > PAGE_MODE_6_LEVEL);
- free_sub_pt(root, domain->mode, freelist);
+ freelist = free_sub_pt(root, domain->mode, freelist);
free_page_list(freelist);
}
@@ -1435,10 +1462,11 @@ static void free_pagetable(struct protection_domain *domain)
* another level increases the size of the address space by 9 bits to a size up
* to 64 bits.
*/
-static void increase_address_space(struct protection_domain *domain,
+static bool increase_address_space(struct protection_domain *domain,
gfp_t gfp)
{
unsigned long flags;
+ bool ret = false;
u64 *pte;
spin_lock_irqsave(&domain->lock, flags);
@@ -1455,19 +1483,21 @@ static void increase_address_space(struct protection_domain *domain,
iommu_virt_to_phys(domain->pt_root));
domain->pt_root = pte;
domain->mode += 1;
- domain->updated = true;
+
+ ret = true;
out:
spin_unlock_irqrestore(&domain->lock, flags);
- return;
+ return ret;
}
static u64 *alloc_pte(struct protection_domain *domain,
unsigned long address,
unsigned long page_size,
u64 **pte_page,
- gfp_t gfp)
+ gfp_t gfp,
+ bool *updated)
{
int level, end_lvl;
u64 *pte, *page;
@@ -1475,7 +1505,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
BUG_ON(!is_power_of_2(page_size));
while (address > PM_LEVEL_SIZE(domain->mode))
- increase_address_space(domain, gfp);
+ *updated = increase_address_space(domain, gfp) || *updated;
level = domain->mode - 1;
pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
@@ -1489,9 +1519,32 @@ static u64 *alloc_pte(struct protection_domain *domain,
__pte = *pte;
pte_level = PM_PTE_LEVEL(__pte);
- if (!IOMMU_PTE_PRESENT(__pte) ||
+ /*
+ * If we replace a series of large PTEs, we need
+ * to tear down all of them.
+ */
+ if (IOMMU_PTE_PRESENT(__pte) &&
pte_level == PAGE_MODE_7_LEVEL) {
+ unsigned long count, i;
+ u64 *lpte;
+
+ lpte = first_pte_l7(pte, NULL, &count);
+
+ /*
+ * Unmap the replicated PTEs that still match the
+ * original large mapping
+ */
+ for (i = 0; i < count; ++i)
+ cmpxchg64(&lpte[i], __pte, 0ULL);
+
+ *updated = true;
+ continue;
+ }
+
+ if (!IOMMU_PTE_PRESENT(__pte) ||
+ pte_level == PAGE_MODE_NONE) {
page = (u64 *)get_zeroed_page(gfp);
+
if (!page)
return NULL;
@@ -1500,8 +1553,8 @@ static u64 *alloc_pte(struct protection_domain *domain,
/* pte could have been changed somewhere. */
if (cmpxchg64(pte, __pte, __npte) != __pte)
free_page((unsigned long)page);
- else if (pte_level == PAGE_MODE_7_LEVEL)
- domain->updated = true;
+ else if (IOMMU_PTE_PRESENT(__pte))
+ *updated = true;
continue;
}
@@ -1566,17 +1619,12 @@ static u64 *fetch_pte(struct protection_domain *domain,
*page_size = PTE_LEVEL_PAGE_SIZE(level);
}
- if (PM_PTE_LEVEL(*pte) == 0x07) {
- unsigned long pte_mask;
-
- /*
- * If we have a series of large PTEs, make
- * sure to return a pointer to the first one.
- */
- *page_size = pte_mask = PTE_PAGE_SIZE(*pte);
- pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
- pte = (u64 *)(((unsigned long)pte) & pte_mask);
- }
+ /*
+ * If we have a series of large PTEs, make
+ * sure to return a pointer to the first one.
+ */
+ if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
+ pte = first_pte_l7(pte, page_size, NULL);
return pte;
}
@@ -1615,26 +1663,29 @@ static int iommu_map_page(struct protection_domain *dom,
gfp_t gfp)
{
struct page *freelist = NULL;
+ bool updated = false;
u64 __pte, *pte;
- int i, count;
+ int ret, i, count;
BUG_ON(!IS_ALIGNED(bus_addr, page_size));
BUG_ON(!IS_ALIGNED(phys_addr, page_size));
+ ret = -EINVAL;
if (!(prot & IOMMU_PROT_MASK))
- return -EINVAL;
+ goto out;
count = PAGE_SIZE_PTE_COUNT(page_size);
- pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp);
+ pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated);
+ ret = -ENOMEM;
if (!pte)
- return -ENOMEM;
+ goto out;
for (i = 0; i < count; ++i)
freelist = free_clear_pte(&pte[i], pte[i], freelist);
if (freelist != NULL)
- dom->updated = true;
+ updated = true;
if (count > 1) {
__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
@@ -1650,12 +1701,21 @@ static int iommu_map_page(struct protection_domain *dom,
for (i = 0; i < count; ++i)
pte[i] = __pte;
- update_domain(dom);
+ ret = 0;
+
+out:
+ if (updated) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dom->lock, flags);
+ update_domain(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
+ }
/* Everything flushed out, free pages now */
free_page_list(freelist);
- return 0;
+ return ret;
}
static unsigned long iommu_unmap_page(struct protection_domain *dom,
@@ -1806,8 +1866,12 @@ static void free_gcr3_table(struct protection_domain *domain)
static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dom->domain.lock, flags);
domain_flush_tlb(&dom->domain);
domain_flush_complete(&dom->domain);
+ spin_unlock_irqrestore(&dom->domain.lock, flags);
}
static void iova_domain_flush_tlb(struct iova_domain *iovad)
@@ -2022,36 +2086,6 @@ static void do_detach(struct iommu_dev_data *dev_data)
domain->dev_cnt -= 1;
}
-/*
- * If a device is not yet associated with a domain, this function makes the
- * device visible in the domain
- */
-static int __attach_device(struct iommu_dev_data *dev_data,
- struct protection_domain *domain)
-{
- int ret;
-
- /* lock domain */
- spin_lock(&domain->lock);
-
- ret = -EBUSY;
- if (dev_data->domain != NULL)
- goto out_unlock;
-
- /* Attach alias group root */
- do_attach(dev_data, domain);
-
- ret = 0;
-
-out_unlock:
-
- /* ready */
- spin_unlock(&domain->lock);
-
- return ret;
-}
-
-
static void pdev_iommuv2_disable(struct pci_dev *pdev)
{
pci_disable_ats(pdev);
@@ -2133,19 +2167,28 @@ static int attach_device(struct device *dev,
unsigned long flags;
int ret;
+ spin_lock_irqsave(&domain->lock, flags);
+
dev_data = get_dev_data(dev);
+ spin_lock(&dev_data->lock);
+
+ ret = -EBUSY;
+ if (dev_data->domain != NULL)
+ goto out;
+
if (!dev_is_pci(dev))
goto skip_ats_check;
pdev = to_pci_dev(dev);
if (domain->flags & PD_IOMMUV2_MASK) {
+ ret = -EINVAL;
if (!dev_data->passthrough)
- return -EINVAL;
+ goto out;
if (dev_data->iommu_v2) {
if (pdev_iommuv2_enable(pdev) != 0)
- return -EINVAL;
+ goto out;
dev_data->ats.enabled = true;
dev_data->ats.qdep = pci_ats_queue_depth(pdev);
@@ -2158,9 +2201,9 @@ static int attach_device(struct device *dev,
}
skip_ats_check:
- spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
- ret = __attach_device(dev_data, domain);
- spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+ ret = 0;
+
+ do_attach(dev_data, domain);
/*
* We might boot into a crash-kernel here. The crashed kernel
@@ -2169,23 +2212,14 @@ skip_ats_check:
*/
domain_flush_tlb_pde(domain);
- return ret;
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct iommu_dev_data *dev_data)
-{
- struct protection_domain *domain;
-
- domain = dev_data->domain;
+ domain_flush_complete(domain);
- spin_lock(&domain->lock);
+out:
+ spin_unlock(&dev_data->lock);
- do_detach(dev_data);
+ spin_unlock_irqrestore(&domain->lock, flags);
- spin_unlock(&domain->lock);
+ return ret;
}
/*
@@ -2200,6 +2234,10 @@ static void detach_device(struct device *dev)
dev_data = get_dev_data(dev);
domain = dev_data->domain;
+ spin_lock_irqsave(&domain->lock, flags);
+
+ spin_lock(&dev_data->lock);
+
/*
* First check if the device is still attached. It might already
* be detached from its domain because the generic
@@ -2207,15 +2245,12 @@ static void detach_device(struct device *dev)
* our alias handling.
*/
if (WARN_ON(!dev_data->domain))
- return;
+ goto out;
- /* lock device table */
- spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __detach_device(dev_data);
- spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+ do_detach(dev_data);
if (!dev_is_pci(dev))
- return;
+ goto out;
if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
pdev_iommuv2_disable(to_pci_dev(dev));
@@ -2223,6 +2258,11 @@ static void detach_device(struct device *dev)
pci_disable_ats(to_pci_dev(dev));
dev_data->ats.enabled = false;
+
+out:
+ spin_unlock(&dev_data->lock);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
}
static int amd_iommu_add_device(struct device *dev)
@@ -2354,15 +2394,10 @@ static void update_device_table(struct protection_domain *domain)
static void update_domain(struct protection_domain *domain)
{
- if (!domain->updated)
- return;
-
update_device_table(domain);
domain_flush_devices(domain);
domain_flush_tlb_pde(domain);
-
- domain->updated = false;
}
static int dir2prot(enum dma_data_direction direction)
@@ -2392,6 +2427,7 @@ static dma_addr_t __map_single(struct device *dev,
{
dma_addr_t offset = paddr & ~PAGE_MASK;
dma_addr_t address, start, ret;
+ unsigned long flags;
unsigned int pages;
int prot = 0;
int i;
@@ -2429,8 +2465,10 @@ out_unmap:
iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
}
+ spin_lock_irqsave(&dma_dom->domain.lock, flags);
domain_flush_tlb(&dma_dom->domain);
domain_flush_complete(&dma_dom->domain);
+ spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
dma_ops_free_iova(dma_dom, address, pages);
@@ -2459,8 +2497,12 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
}
if (amd_iommu_unmap_flush) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dma_dom->domain.lock, flags);
domain_flush_tlb(&dma_dom->domain);
domain_flush_complete(&dma_dom->domain);
+ spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
dma_ops_free_iova(dma_dom, dma_addr, pages);
} else {
pages = __roundup_pow_of_two(pages);
@@ -2866,16 +2908,16 @@ static void cleanup_domain(struct protection_domain *domain)
struct iommu_dev_data *entry;
unsigned long flags;
- spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ spin_lock_irqsave(&domain->lock, flags);
while (!list_empty(&domain->dev_list)) {
entry = list_first_entry(&domain->dev_list,
struct iommu_dev_data, list);
BUG_ON(!entry->domain);
- __detach_device(entry);
+ do_detach(entry);
}
- spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+ spin_unlock_irqrestore(&domain->lock, flags);
}
static void protection_domain_free(struct protection_domain *domain)
@@ -3226,9 +3268,12 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
{
struct protection_domain *dom = to_pdomain(domain);
+ unsigned long flags;
+ spin_lock_irqsave(&dom->lock, flags);
domain_flush_tlb_pde(dom);
domain_flush_complete(dom);
+ spin_unlock_irqrestore(&dom->lock, flags);
}
static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
@@ -3290,7 +3335,6 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom)
/* Update data structure */
domain->mode = PAGE_MODE_NONE;
- domain->updated = true;
/* Make changes visible to IOMMUs */
update_domain(domain);
@@ -3336,7 +3380,6 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
domain->glx = levels;
domain->flags |= PD_IOMMUV2_MASK;
- domain->updated = true;
update_domain(domain);
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 9ac229e92b07..c9c1612d52e0 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -475,7 +475,6 @@ struct protection_domain {
int glx; /* Number of levels for GCR3 table */
u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */
- bool updated; /* complete domain flush required */
unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
@@ -634,6 +633,9 @@ struct devid_map {
* This struct contains device specific data for the IOMMU
*/
struct iommu_dev_data {
+ /*Protect against attach/detach races */
+ spinlock_t lock;
+
struct list_head list; /* For domain->dev_list */
struct llist_node dev_data_list; /* For global dev_data_list */
struct protection_domain *domain; /* Domain the device is bound to */