diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-29 10:00:14 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-29 10:00:14 -0700 |
commit | 4d2af08ed08ce87c4fd2379fa857153631ee8537 (patch) | |
tree | 6e2c3891290440fb902145fa7a11837f7d17f694 /drivers/iommu | |
parent | 02dc96ef6c25f990452c114c59d75c368a1f4c8f (diff) | |
parent | 2a78f9962565e53b78363eaf516eb052009e8020 (diff) | |
download | lwn-4d2af08ed08ce87c4fd2379fa857153631ee8537.tar.gz lwn-4d2af08ed08ce87c4fd2379fa857153631ee8537.zip |
Merge tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu
Pull iommu fixes from Joerg Roedel:
"A couple of fixes for the AMD IOMMU driver have piled up:
- Some fixes for the reworked IO page-table which caused memory leaks
or did not allow to downgrade mappings under some conditions.
- Locking fixes to fix a couple of possible races around accessing
'struct protection_domain'. The races got introduced when the
dma-ops path became lock-less in the fast-path"
* tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
iommu/amd: Lock code paths traversing protection_domain->dev_list
iommu/amd: Lock dev_data in attach/detach code paths
iommu/amd: Check for busy devices earlier in attach_device()
iommu/amd: Take domain->lock for complete attach/detach path
iommu/amd: Remove amd_iommu_devtable_lock
iommu/amd: Remove domain->updated
iommu/amd: Wait for completion of IOTLB flush in attach_device
iommu/amd: Unmap all L7 PTEs when downgrading page-sizes
iommu/amd: Introduce first_pte_l7() helper
iommu/amd: Fix downgrading default page-sizes in alloc_pte()
iommu/amd: Fix pages leak in free_pagetable()
Diffstat (limited to 'drivers/iommu')
-rw-r--r-- | drivers/iommu/amd_iommu.c | 229 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu_types.h | 4 |
2 files changed, 139 insertions, 94 deletions
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 97975bb7f347..2369b8af81f3 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -70,7 +70,6 @@ */ #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) -static DEFINE_SPINLOCK(amd_iommu_devtable_lock); static DEFINE_SPINLOCK(pd_bitmap_lock); /* List of all available dev_data structures */ @@ -202,6 +201,7 @@ static struct iommu_dev_data *alloc_dev_data(u16 devid) if (!dev_data) return NULL; + spin_lock_init(&dev_data->lock); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -501,6 +501,29 @@ static void iommu_uninit_device(struct device *dev) */ } +/* + * Helper function to get the first pte of a large mapping + */ +static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, + unsigned long *count) +{ + unsigned long pte_mask, pg_size, cnt; + u64 *fpte; + + pg_size = PTE_PAGE_SIZE(*pte); + cnt = PAGE_SIZE_PTE_COUNT(pg_size); + pte_mask = ~((cnt << 3) - 1); + fpte = (u64 *)(((unsigned long)pte) & pte_mask); + + if (page_size) + *page_size = pg_size; + + if (count) + *count = cnt; + + return fpte; +} + /**************************************************************************** * * Interrupt handling functions @@ -1311,8 +1334,12 @@ static void domain_flush_np_cache(struct protection_domain *domain, dma_addr_t iova, size_t size) { if (unlikely(amd_iommu_np_cache)) { + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); domain_flush_pages(domain, iova, size); domain_flush_complete(domain); + spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1425,7 +1452,7 @@ static void free_pagetable(struct protection_domain *domain) BUG_ON(domain->mode < PAGE_MODE_NONE || domain->mode > PAGE_MODE_6_LEVEL); - free_sub_pt(root, domain->mode, freelist); + freelist = free_sub_pt(root, domain->mode, freelist); free_page_list(freelist); } @@ -1435,10 +1462,11 @@ static void free_pagetable(struct protection_domain *domain) * another level increases the size of the address space by 9 bits to a size up * to 64 bits. */ -static void increase_address_space(struct protection_domain *domain, +static bool increase_address_space(struct protection_domain *domain, gfp_t gfp) { unsigned long flags; + bool ret = false; u64 *pte; spin_lock_irqsave(&domain->lock, flags); @@ -1455,19 +1483,21 @@ static void increase_address_space(struct protection_domain *domain, iommu_virt_to_phys(domain->pt_root)); domain->pt_root = pte; domain->mode += 1; - domain->updated = true; + + ret = true; out: spin_unlock_irqrestore(&domain->lock, flags); - return; + return ret; } static u64 *alloc_pte(struct protection_domain *domain, unsigned long address, unsigned long page_size, u64 **pte_page, - gfp_t gfp) + gfp_t gfp, + bool *updated) { int level, end_lvl; u64 *pte, *page; @@ -1475,7 +1505,7 @@ static u64 *alloc_pte(struct protection_domain *domain, BUG_ON(!is_power_of_2(page_size)); while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); + *updated = increase_address_space(domain, gfp) || *updated; level = domain->mode - 1; pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; @@ -1489,9 +1519,32 @@ static u64 *alloc_pte(struct protection_domain *domain, __pte = *pte; pte_level = PM_PTE_LEVEL(__pte); - if (!IOMMU_PTE_PRESENT(__pte) || + /* + * If we replace a series of large PTEs, we need + * to tear down all of them. + */ + if (IOMMU_PTE_PRESENT(__pte) && pte_level == PAGE_MODE_7_LEVEL) { + unsigned long count, i; + u64 *lpte; + + lpte = first_pte_l7(pte, NULL, &count); + + /* + * Unmap the replicated PTEs that still match the + * original large mapping + */ + for (i = 0; i < count; ++i) + cmpxchg64(&lpte[i], __pte, 0ULL); + + *updated = true; + continue; + } + + if (!IOMMU_PTE_PRESENT(__pte) || + pte_level == PAGE_MODE_NONE) { page = (u64 *)get_zeroed_page(gfp); + if (!page) return NULL; @@ -1500,8 +1553,8 @@ static u64 *alloc_pte(struct protection_domain *domain, /* pte could have been changed somewhere. */ if (cmpxchg64(pte, __pte, __npte) != __pte) free_page((unsigned long)page); - else if (pte_level == PAGE_MODE_7_LEVEL) - domain->updated = true; + else if (IOMMU_PTE_PRESENT(__pte)) + *updated = true; continue; } @@ -1566,17 +1619,12 @@ static u64 *fetch_pte(struct protection_domain *domain, *page_size = PTE_LEVEL_PAGE_SIZE(level); } - if (PM_PTE_LEVEL(*pte) == 0x07) { - unsigned long pte_mask; - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - *page_size = pte_mask = PTE_PAGE_SIZE(*pte); - pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); - pte = (u64 *)(((unsigned long)pte) & pte_mask); - } + /* + * If we have a series of large PTEs, make + * sure to return a pointer to the first one. + */ + if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) + pte = first_pte_l7(pte, page_size, NULL); return pte; } @@ -1615,26 +1663,29 @@ static int iommu_map_page(struct protection_domain *dom, gfp_t gfp) { struct page *freelist = NULL; + bool updated = false; u64 __pte, *pte; - int i, count; + int ret, i, count; BUG_ON(!IS_ALIGNED(bus_addr, page_size)); BUG_ON(!IS_ALIGNED(phys_addr, page_size)); + ret = -EINVAL; if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; + goto out; count = PAGE_SIZE_PTE_COUNT(page_size); - pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp); + pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated); + ret = -ENOMEM; if (!pte) - return -ENOMEM; + goto out; for (i = 0; i < count; ++i) freelist = free_clear_pte(&pte[i], pte[i], freelist); if (freelist != NULL) - dom->updated = true; + updated = true; if (count > 1) { __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size); @@ -1650,12 +1701,21 @@ static int iommu_map_page(struct protection_domain *dom, for (i = 0; i < count; ++i) pte[i] = __pte; - update_domain(dom); + ret = 0; + +out: + if (updated) { + unsigned long flags; + + spin_lock_irqsave(&dom->lock, flags); + update_domain(dom); + spin_unlock_irqrestore(&dom->lock, flags); + } /* Everything flushed out, free pages now */ free_page_list(freelist); - return 0; + return ret; } static unsigned long iommu_unmap_page(struct protection_domain *dom, @@ -1806,8 +1866,12 @@ static void free_gcr3_table(struct protection_domain *domain) static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom) { + unsigned long flags; + + spin_lock_irqsave(&dom->domain.lock, flags); domain_flush_tlb(&dom->domain); domain_flush_complete(&dom->domain); + spin_unlock_irqrestore(&dom->domain.lock, flags); } static void iova_domain_flush_tlb(struct iova_domain *iovad) @@ -2022,36 +2086,6 @@ static void do_detach(struct iommu_dev_data *dev_data) domain->dev_cnt -= 1; } -/* - * If a device is not yet associated with a domain, this function makes the - * device visible in the domain - */ -static int __attach_device(struct iommu_dev_data *dev_data, - struct protection_domain *domain) -{ - int ret; - - /* lock domain */ - spin_lock(&domain->lock); - - ret = -EBUSY; - if (dev_data->domain != NULL) - goto out_unlock; - - /* Attach alias group root */ - do_attach(dev_data, domain); - - ret = 0; - -out_unlock: - - /* ready */ - spin_unlock(&domain->lock); - - return ret; -} - - static void pdev_iommuv2_disable(struct pci_dev *pdev) { pci_disable_ats(pdev); @@ -2133,19 +2167,28 @@ static int attach_device(struct device *dev, unsigned long flags; int ret; + spin_lock_irqsave(&domain->lock, flags); + dev_data = get_dev_data(dev); + spin_lock(&dev_data->lock); + + ret = -EBUSY; + if (dev_data->domain != NULL) + goto out; + if (!dev_is_pci(dev)) goto skip_ats_check; pdev = to_pci_dev(dev); if (domain->flags & PD_IOMMUV2_MASK) { + ret = -EINVAL; if (!dev_data->passthrough) - return -EINVAL; + goto out; if (dev_data->iommu_v2) { if (pdev_iommuv2_enable(pdev) != 0) - return -EINVAL; + goto out; dev_data->ats.enabled = true; dev_data->ats.qdep = pci_ats_queue_depth(pdev); @@ -2158,9 +2201,9 @@ static int attach_device(struct device *dev, } skip_ats_check: - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); - ret = __attach_device(dev_data, domain); - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + ret = 0; + + do_attach(dev_data, domain); /* * We might boot into a crash-kernel here. The crashed kernel @@ -2169,23 +2212,14 @@ skip_ats_check: */ domain_flush_tlb_pde(domain); - return ret; -} - -/* - * Removes a device from a protection domain (unlocked) - */ -static void __detach_device(struct iommu_dev_data *dev_data) -{ - struct protection_domain *domain; - - domain = dev_data->domain; + domain_flush_complete(domain); - spin_lock(&domain->lock); +out: + spin_unlock(&dev_data->lock); - do_detach(dev_data); + spin_unlock_irqrestore(&domain->lock, flags); - spin_unlock(&domain->lock); + return ret; } /* @@ -2200,6 +2234,10 @@ static void detach_device(struct device *dev) dev_data = get_dev_data(dev); domain = dev_data->domain; + spin_lock_irqsave(&domain->lock, flags); + + spin_lock(&dev_data->lock); + /* * First check if the device is still attached. It might already * be detached from its domain because the generic @@ -2207,15 +2245,12 @@ static void detach_device(struct device *dev) * our alias handling. */ if (WARN_ON(!dev_data->domain)) - return; + goto out; - /* lock device table */ - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(dev_data); - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + do_detach(dev_data); if (!dev_is_pci(dev)) - return; + goto out; if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2) pdev_iommuv2_disable(to_pci_dev(dev)); @@ -2223,6 +2258,11 @@ static void detach_device(struct device *dev) pci_disable_ats(to_pci_dev(dev)); dev_data->ats.enabled = false; + +out: + spin_unlock(&dev_data->lock); + + spin_unlock_irqrestore(&domain->lock, flags); } static int amd_iommu_add_device(struct device *dev) @@ -2354,15 +2394,10 @@ static void update_device_table(struct protection_domain *domain) static void update_domain(struct protection_domain *domain) { - if (!domain->updated) - return; - update_device_table(domain); domain_flush_devices(domain); domain_flush_tlb_pde(domain); - - domain->updated = false; } static int dir2prot(enum dma_data_direction direction) @@ -2392,6 +2427,7 @@ static dma_addr_t __map_single(struct device *dev, { dma_addr_t offset = paddr & ~PAGE_MASK; dma_addr_t address, start, ret; + unsigned long flags; unsigned int pages; int prot = 0; int i; @@ -2429,8 +2465,10 @@ out_unmap: iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); } + spin_lock_irqsave(&dma_dom->domain.lock, flags); domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); dma_ops_free_iova(dma_dom, address, pages); @@ -2459,8 +2497,12 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, } if (amd_iommu_unmap_flush) { + unsigned long flags; + + spin_lock_irqsave(&dma_dom->domain.lock, flags); domain_flush_tlb(&dma_dom->domain); domain_flush_complete(&dma_dom->domain); + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); dma_ops_free_iova(dma_dom, dma_addr, pages); } else { pages = __roundup_pow_of_two(pages); @@ -2866,16 +2908,16 @@ static void cleanup_domain(struct protection_domain *domain) struct iommu_dev_data *entry; unsigned long flags; - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); + spin_lock_irqsave(&domain->lock, flags); while (!list_empty(&domain->dev_list)) { entry = list_first_entry(&domain->dev_list, struct iommu_dev_data, list); BUG_ON(!entry->domain); - __detach_device(entry); + do_detach(entry); } - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + spin_unlock_irqrestore(&domain->lock, flags); } static void protection_domain_free(struct protection_domain *domain) @@ -3226,9 +3268,12 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; + spin_lock_irqsave(&dom->lock, flags); domain_flush_tlb_pde(dom); domain_flush_complete(dom); + spin_unlock_irqrestore(&dom->lock, flags); } static void amd_iommu_iotlb_sync(struct iommu_domain *domain, @@ -3290,7 +3335,6 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom) /* Update data structure */ domain->mode = PAGE_MODE_NONE; - domain->updated = true; /* Make changes visible to IOMMUs */ update_domain(domain); @@ -3336,7 +3380,6 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) domain->glx = levels; domain->flags |= PD_IOMMUV2_MASK; - domain->updated = true; update_domain(domain); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 9ac229e92b07..c9c1612d52e0 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -475,7 +475,6 @@ struct protection_domain { int glx; /* Number of levels for GCR3 table */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ - bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; @@ -634,6 +633,9 @@ struct devid_map { * This struct contains device specific data for the IOMMU */ struct iommu_dev_data { + /*Protect against attach/detach races */ + spinlock_t lock; + struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ struct protection_domain *domain; /* Domain the device is bound to */ |