author     Linus Torvalds <torvalds@linux-foundation.org>   2019-07-14 19:42:11 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2019-07-14 19:42:11 -0700
commit     fec88ab0af9706b2201e5daf377c5031c62d11f7 (patch)
tree       7206e8a3ff2dea87f912f4660d453a8c118248ac /mm/hmm.c
parent     fa6e951a2a440babd7a7310d0f4713e618061767 (diff)
parent     cc5dfd59e375f4d0f2b64643723d16b38b2f2d78 (diff)
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull HMM updates from Jason Gunthorpe:
"Improvements and bug fixes for the hmm interface in the kernel:
- Improve clarity, locking and APIs related to the 'hmm mirror'
feature merged last cycle. In linux-next we now see AMDGPU and
nouveau using this API.
- Remove old or transitional hmm APIs. These are holdovers from the
past with no users, or APIs that existed only to manage cross-tree
conflicts. A few more of these cleanups did not make the merge
window cutoff.
- Improve some core mm APIs:
- export alloc_pages_vma() for driver use
- refactor into devm_request_free_mem_region() to manage
DEVICE_PRIVATE resource reservations
- refactor duplicative driver code into the core dev_pagemap
struct
- Remove the hmm wrappers of the improved core mm APIs; instead, have
drivers use the simplified API directly (a driver-side sketch of this
flow follows the commit list below)
- Remove DEVICE_PUBLIC
- Simplify the kconfig flow for the hmm users and core code"
* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (42 commits)
mm: don't select MIGRATE_VMA_HELPER from HMM_MIRROR
mm: remove the HMM config option
mm: sort out the DEVICE_PRIVATE Kconfig mess
mm: simplify ZONE_DEVICE page private data
mm: remove hmm_devmem_add
mm: remove hmm_vma_alloc_locked_page
nouveau: use devm_memremap_pages directly
nouveau: use alloc_page_vma directly
PCI/P2PDMA: use the dev_pagemap internal refcount
device-dax: use the dev_pagemap internal refcount
memremap: provide an optional internal refcount in struct dev_pagemap
memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID flag
memremap: remove the data field in struct dev_pagemap
memremap: add a migrate_to_ram method to struct dev_pagemap_ops
memremap: lift the devmap_enable manipulation into devm_memremap_pages
memremap: pass a struct dev_pagemap to ->kill and ->cleanup
memremap: move dev_pagemap callbacks into a separate structure
memremap: validate the pagemap type passed to devm_memremap_pages
mm: factor out a devm_request_free_mem_region helper
mm: export alloc_pages_vma
...
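The core-mm items in the pull message above (devm_request_free_mem_region(), the consolidated dev_pagemap struct with its optional internal refcount, and the removal of the hmm_devmem_* wrappers) change how a driver sets up DEVICE_PRIVATE memory. The following is a minimal, hypothetical driver-side sketch of that flow against the post-series API; the mydrv_* names and the trimmed error handling are illustrative assumptions, not code from this merge.

```c
#include <linux/mm.h>
#include <linux/ioport.h>
#include <linux/memremap.h>

/* Return a device-private page to the driver's own allocator. */
static void mydrv_page_free(struct page *page)
{
	/* ... driver-specific bookkeeping ... */
}

/*
 * CPU fault on a device-private page: a real driver would migrate the
 * data back to system RAM here; this stub just reports failure.
 */
static vm_fault_t mydrv_migrate_to_ram(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}

static const struct dev_pagemap_ops mydrv_pagemap_ops = {
	.page_free	= mydrv_page_free,
	.migrate_to_ram	= mydrv_migrate_to_ram,
};

static int mydrv_init_device_memory(struct device *dev,
				    struct dev_pagemap *pgmap,
				    unsigned long size)
{
	struct resource *res;
	void *addr;

	/* Find and reserve a free physical range for the device pages. */
	res = devm_request_free_mem_region(dev, &iomem_resource, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	pgmap->type = MEMORY_DEVICE_PRIVATE;
	pgmap->res = *res;
	pgmap->ops = &mydrv_pagemap_ops;
	/* pgmap->ref left NULL: rely on the new internal refcount. */

	/* Hotplug the range as ZONE_DEVICE memory and create struct pages. */
	addr = devm_memremap_pages(dev, pgmap);
	return IS_ERR(addr) ? PTR_ERR(addr) : 0;
}
```

Compared with the removed hmm_devmem_add() path visible in the diff below, the resource search, percpu_ref management and the kill/cleanup callbacks are all handled inside devm_memremap_pages().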
Diffstat (limited to 'mm/hmm.c')
-rw-r--r--   mm/hmm.c   587
1 file changed, 131 insertions(+), 456 deletions(-)
@@ -20,26 +20,14 @@ #include <linux/swapops.h> #include <linux/hugetlb.h> #include <linux/memremap.h> +#include <linux/sched/mm.h> #include <linux/jump_label.h> #include <linux/dma-mapping.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) - -#if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -static inline struct hmm *mm_get_hmm(struct mm_struct *mm) -{ - struct hmm *hmm = READ_ONCE(mm->hmm); - - if (hmm && kref_get_unless_zero(&hmm->kref)) - return hmm; - - return NULL; -} - /** * hmm_get_or_create - register HMM against an mm (HMM internal) * @@ -54,11 +42,16 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm) */ static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = mm_get_hmm(mm); - bool cleanup = false; + struct hmm *hmm; - if (hmm) - return hmm; + lockdep_assert_held_write(&mm->mmap_sem); + + /* Abuse the page_table_lock to also protect mm->hmm. */ + spin_lock(&mm->page_table_lock); + hmm = mm->hmm; + if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref)) + goto out_unlock; + spin_unlock(&mm->page_table_lock); hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) @@ -68,55 +61,50 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) init_rwsem(&hmm->mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); - mutex_init(&hmm->lock); + spin_lock_init(&hmm->ranges_lock); kref_init(&hmm->kref); hmm->notifiers = 0; - hmm->dead = false; hmm->mm = mm; - spin_lock(&mm->page_table_lock); - if (!mm->hmm) - mm->hmm = hmm; - else - cleanup = true; - spin_unlock(&mm->page_table_lock); + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { + kfree(hmm); + return NULL; + } - if (cleanup) - goto error; + mmgrab(hmm->mm); /* - * We should only get here if hold the mmap_sem in write mode ie on - * registration of first mirror through hmm_mirror_register() + * We hold the exclusive mmap_sem here so we know that mm->hmm is + * still NULL or 0 kref, and is safe to update. 
*/ - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) - goto error_mm; + spin_lock(&mm->page_table_lock); + mm->hmm = hmm; +out_unlock: + spin_unlock(&mm->page_table_lock); return hmm; +} -error_mm: - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); -error: +static void hmm_free_rcu(struct rcu_head *rcu) +{ + struct hmm *hmm = container_of(rcu, struct hmm, rcu); + + mmdrop(hmm->mm); kfree(hmm); - return NULL; } static void hmm_free(struct kref *kref) { struct hmm *hmm = container_of(kref, struct hmm, kref); - struct mm_struct *mm = hmm->mm; - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + spin_lock(&hmm->mm->page_table_lock); + if (hmm->mm->hmm == hmm) + hmm->mm->hmm = NULL; + spin_unlock(&hmm->mm->page_table_lock); - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); - - kfree(hmm); + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm); + mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); } static inline void hmm_put(struct hmm *hmm) @@ -124,86 +112,73 @@ static inline void hmm_put(struct hmm *hmm) kref_put(&hmm->kref, hmm_free); } -void hmm_mm_destroy(struct mm_struct *mm) +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct hmm *hmm; + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); + struct hmm_mirror *mirror; - spin_lock(&mm->page_table_lock); - hmm = mm_get_hmm(mm); - mm->hmm = NULL; - if (hmm) { - hmm->mm = NULL; - hmm->dead = true; - spin_unlock(&mm->page_table_lock); - hmm_put(hmm); + /* Bail out if hmm is in the process of being freed */ + if (!kref_get_unless_zero(&hmm->kref)) return; + + /* + * Since hmm_range_register() holds the mmget() lock hmm_release() is + * prevented as long as a range exists. + */ + WARN_ON(!list_empty_careful(&hmm->ranges)); + + down_read(&hmm->mirrors_sem); + list_for_each_entry(mirror, &hmm->mirrors, list) { + /* + * Note: The driver is not allowed to trigger + * hmm_mirror_unregister() from this thread. + */ + if (mirror->ops->release) + mirror->ops->release(mirror); } + up_read(&hmm->mirrors_sem); - spin_unlock(&mm->page_table_lock); + hmm_put(hmm); } -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) +static void notifiers_decrement(struct hmm *hmm) { - struct hmm *hmm = mm_get_hmm(mm); - struct hmm_mirror *mirror; - struct hmm_range *range; - - /* Report this HMM as dying. */ - hmm->dead = true; + unsigned long flags; - /* Wake-up everyone waiting on any range. */ - mutex_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) { - range->valid = false; - } - wake_up_all(&hmm->wq); - mutex_unlock(&hmm->lock); + spin_lock_irqsave(&hmm->ranges_lock, flags); + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; - down_write(&hmm->mirrors_sem); - mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, - list); - while (mirror) { - list_del_init(&mirror->list); - if (mirror->ops->release) { - /* - * Drop mirrors_sem so callback can wait on any pending - * work that might itself trigger mmu_notifier callback - * and thus would deadlock with us. 
- */ - up_write(&hmm->mirrors_sem); - mirror->ops->release(mirror); - down_write(&hmm->mirrors_sem); + list_for_each_entry(range, &hmm->ranges, list) { + if (range->valid) + continue; + range->valid = true; } - mirror = list_first_entry_or_null(&hmm->mirrors, - struct hmm_mirror, list); + wake_up_all(&hmm->wq); } - up_write(&hmm->mirrors_sem); - - hmm_put(hmm); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); } static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_update update; struct hmm_range *range; + unsigned long flags; int ret = 0; - VM_BUG_ON(!hmm); + if (!kref_get_unless_zero(&hmm->kref)) + return 0; update.start = nrange->start; update.end = nrange->end; update.event = HMM_UPDATE_INVALIDATE; update.blockable = mmu_notifier_range_blockable(nrange); - if (mmu_notifier_range_blockable(nrange)) - mutex_lock(&hmm->lock); - else if (!mutex_trylock(&hmm->lock)) { - ret = -EAGAIN; - goto out; - } + spin_lock_irqsave(&hmm->ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, &hmm->ranges, list) { if (update.end < range->start || update.start >= range->end) @@ -211,7 +186,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, range->valid = false; } - mutex_unlock(&hmm->lock); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); if (mmu_notifier_range_blockable(nrange)) down_read(&hmm->mirrors_sem); @@ -219,19 +194,23 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, ret = -EAGAIN; goto out; } + list_for_each_entry(mirror, &hmm->mirrors, list) { - int ret; + int rc; - ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); - if (!update.blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); + rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + if (rc) { + if (WARN_ON(update.blockable || rc != -EAGAIN)) + continue; ret = -EAGAIN; - goto out; + break; } } up_read(&hmm->mirrors_sem); out: + if (ret) + notifiers_decrement(hmm); hmm_put(hmm); return ret; } @@ -239,24 +218,12 @@ out: static void hmm_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); - - VM_BUG_ON(!hmm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - mutex_lock(&hmm->lock); - hmm->notifiers--; - if (!hmm->notifiers) { - struct hmm_range *range; - - list_for_each_entry(range, &hmm->ranges, list) { - if (range->valid) - continue; - range->valid = true; - } - wake_up_all(&hmm->wq); - } - mutex_unlock(&hmm->lock); + if (!kref_get_unless_zero(&hmm->kref)) + return; + notifiers_decrement(hmm); hmm_put(hmm); } @@ -271,14 +238,15 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { * * @mirror: new mirror struct to register * @mm: mm to register against + * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments * * To start mirroring a process address space, the device driver must register * an HMM mirror struct. - * - * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! 
*/ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) { + lockdep_assert_held_write(&mm->mmap_sem); + /* Sanity check */ if (!mm || !mirror || !mirror->ops) return -EINVAL; @@ -298,23 +266,17 @@ EXPORT_SYMBOL(hmm_mirror_register); /* * hmm_mirror_unregister() - unregister a mirror * - * @mirror: new mirror struct to register + * @mirror: mirror struct to unregister * * Stop mirroring a process address space, and cleanup. */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = READ_ONCE(mirror->hmm); - - if (hmm == NULL) - return; + struct hmm *hmm = mirror->hmm; down_write(&hmm->mirrors_sem); - list_del_init(&mirror->list); - /* To protect us against double unregister ... */ - mirror->hmm = NULL; + list_del(&mirror->list); up_write(&hmm->mirrors_sem); - hmm_put(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -330,7 +292,7 @@ struct hmm_vma_walk { static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, bool write_fault, uint64_t *pfn) { - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; + unsigned int flags = FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; @@ -372,7 +334,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EBUSY after page fault, or page fault error + * Return: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -550,7 +512,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { - if (pte_none(pte) || !pte_present(pte)) + if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) return 0; return pte_write(pte) ? range->flags[HMM_PFN_VALID] | range->flags[HMM_PFN_WRITE] : @@ -788,7 +750,6 @@ again: return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); -#ifdef CONFIG_HUGETLB_PAGE pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); for (i = 0; i < npages; ++i, ++pfn) { hmm_vma_walk->pgmap = get_dev_pagemap(pfn, @@ -804,9 +765,6 @@ again: } hmm_vma_walk->last = end; return 0; -#else - return -EINVAL; -#endif } split_huge_pud(walk->vma, pudp, addr); @@ -909,12 +867,14 @@ static void hmm_pfns_clear(struct hmm_range *range, * Track updates to the CPU page table see include/linux/hmm.h */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); + struct hmm *hmm = mirror->hmm; + unsigned long flags; range->valid = false; range->hmm = NULL; @@ -928,28 +888,24 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - range->hmm = hmm_get_or_create(mm); - if (!range->hmm) - return -EFAULT; - - /* Check if hmm_mm_destroy() was call. */ - if (range->hmm->mm == NULL || range->hmm->dead) { - hmm_put(range->hmm); + /* Prevent hmm_release() from running while the range is valid */ + if (!mmget_not_zero(hmm->mm)) return -EFAULT; - } - /* Initialize range to track CPU page table update */ - mutex_lock(&range->hmm->lock); + /* Initialize range to track CPU page table updates. 
*/ + spin_lock_irqsave(&hmm->ranges_lock, flags); - list_add_rcu(&range->list, &range->hmm->ranges); + range->hmm = hmm; + kref_get(&hmm->kref); + list_add(&range->list, &hmm->ranges); /* * If there are any concurrent notifiers we have to wait for them for * the range to be valid (see hmm_range_wait_until_valid()). */ - if (!range->hmm->notifiers) + if (!hmm->notifiers) range->valid = true; - mutex_unlock(&range->hmm->lock); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); return 0; } @@ -964,25 +920,31 @@ EXPORT_SYMBOL(hmm_range_register); */ void hmm_range_unregister(struct hmm_range *range) { - /* Sanity check this really should not happen. */ - if (range->hmm == NULL || range->end <= range->start) - return; + struct hmm *hmm = range->hmm; + unsigned long flags; - mutex_lock(&range->hmm->lock); - list_del_rcu(&range->list); - mutex_unlock(&range->hmm->lock); + spin_lock_irqsave(&hmm->ranges_lock, flags); + list_del_init(&range->list); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); /* Drop reference taken by hmm_range_register() */ + mmput(hmm->mm); + hmm_put(hmm); + + /* + * The range is now invalid and the ref on the hmm is dropped, so + * poison the pointer. Leave other fields in place, for the caller's + * use. + */ range->valid = false; - hmm_put(range->hmm); - range->hmm = NULL; + memset(&range->hmm, POISON_INUSE, sizeof(range->hmm)); } EXPORT_SYMBOL(hmm_range_unregister); /* * hmm_range_snapshot() - snapshot CPU page table for a range * @range: range - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid * permission (for instance asking for write and range is read only), * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid * vma or it is illegal to access that range), number of valid pages @@ -1001,10 +963,7 @@ long hmm_range_snapshot(struct hmm_range *range) struct vm_area_struct *vma; struct mm_walk mm_walk; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; - + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. */ if (!range->valid) @@ -1015,9 +974,8 @@ long hmm_range_snapshot(struct hmm_range *range) return -EFAULT; if (is_vm_hugetlb_page(vma)) { - struct hstate *h = hstate_vma(vma); - - if (huge_page_shift(h) != range->page_shift && + if (huge_page_shift(hstate_vma(vma)) != + range->page_shift && range->page_shift != PAGE_SHIFT) return -EINVAL; } else { @@ -1066,7 +1024,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of valid pages in range->pfns[] (from range start + * Return: number of valid pages in range->pfns[] (from range start * address). This may be zero. If the return value is negative, * then one of the following values may be returned: * @@ -1100,9 +1058,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) struct mm_walk mm_walk; int ret; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. 
*/ @@ -1184,7 +1140,7 @@ EXPORT_SYMBOL(hmm_range_fault); * @device: device against to dma map page to * @daddrs: dma address of mapped pages * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been * drop and you need to try again, some other error value otherwise * * Note same usage pattern as hmm_range_fault(). @@ -1272,7 +1228,7 @@ EXPORT_SYMBOL(hmm_range_dma_map); * @device: device against which dma map was done * @daddrs: dma address of mapped pages * @dirty: dirty page if it had the write flag set - * Returns: number of page unmapped on success, -EINVAL otherwise + * Return: number of page unmapped on success, -EINVAL otherwise * * Note that caller MUST abide by mmu notifier or use HMM mirror and abide * to the sync_cpu_device_pagetables() callback so that it is safe here to @@ -1328,284 +1284,3 @@ long hmm_range_dma_unmap(struct hmm_range *range, return cpages; } EXPORT_SYMBOL(hmm_range_dma_unmap); -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - - -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - - page = alloc_page_vma(GFP_HIGHUSER, vma, addr); - if (!page) - return NULL; - lock_page(page); - return page; -} -EXPORT_SYMBOL(hmm_vma_alloc_locked_page); - - -static void hmm_devmem_ref_release(struct percpu_ref *ref) -{ - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); - complete(&devmem->completion); -} - -static void hmm_devmem_ref_exit(struct percpu_ref *ref) -{ - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); - wait_for_completion(&devmem->completion); - percpu_ref_exit(ref); -} - -static void hmm_devmem_ref_kill(struct percpu_ref *ref) -{ - percpu_ref_kill(ref); -} - -static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp) -{ - struct hmm_devmem *devmem = page->pgmap->data; - - return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); -} - -static void hmm_devmem_free(struct page *page, void *data) -{ - struct hmm_devmem *devmem = data; - - page->mapping = NULL; - - devmem->ops->free(devmem, page); -} - -/* - * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory - * - * @ops: memory event device driver callback (see struct hmm_devmem_ops) - * @device: device struct to bind the resource too - * @size: size in bytes of the device memory to add - * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise - * - * This function first finds an empty range of physical address big enough to - * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which - * in turn allocates struct pages. It does not do anything beyond that; all - * events affecting the memory will go through the various callbacks provided - * by hmm_devmem_ops struct. - * - * Device driver should call this function during device initialization and - * is then responsible of memory management. HMM only provides helpers. 
- */ -struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size) -{ - struct hmm_devmem *devmem; - resource_size_t addr; - void *result; - int ret; - - dev_pagemap_get_ops(); - - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); - if (!devmem) - return ERR_PTR(-ENOMEM); - - init_completion(&devmem->completion); - devmem->pfn_first = -1UL; - devmem->pfn_last = -1UL; - devmem->resource = NULL; - devmem->device = device; - devmem->ops = ops; - - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, - 0, GFP_KERNEL); - if (ret) - return ERR_PTR(ret); - - size = ALIGN(size, PA_SECTION_SIZE); - addr = min((unsigned long)iomem_resource.end, - (1UL << MAX_PHYSMEM_BITS) - 1); - addr = addr - size + 1UL; - - /* - * FIXME add a new helper to quickly walk resource tree and find free - * range - * - * FIXME what about ioport_resource resource ? - */ - for (; addr > size && addr >= iomem_resource.start; addr -= size) { - ret = region_intersects(addr, size, 0, IORES_DESC_NONE); - if (ret != REGION_DISJOINT) - continue; - - devmem->resource = devm_request_mem_region(device, addr, size, - dev_name(device)); - if (!devmem->resource) - return ERR_PTR(-ENOMEM); - break; - } - if (!devmem->resource) - return ERR_PTR(-ERANGE); - - devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; - devmem->pfn_last = devmem->pfn_first + - (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; - - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.altmap_valid = false; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; - - result = devm_memremap_pages(devmem->device, &devmem->pagemap); - if (IS_ERR(result)) - return result; - return devmem; -} -EXPORT_SYMBOL_GPL(hmm_devmem_add); - -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, - struct device *device, - struct resource *res) -{ - struct hmm_devmem *devmem; - void *result; - int ret; - - if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) - return ERR_PTR(-EINVAL); - - dev_pagemap_get_ops(); - - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); - if (!devmem) - return ERR_PTR(-ENOMEM); - - init_completion(&devmem->completion); - devmem->pfn_first = -1UL; - devmem->pfn_last = -1UL; - devmem->resource = res; - devmem->device = device; - devmem->ops = ops; - - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, - 0, GFP_KERNEL); - if (ret) - return ERR_PTR(ret); - - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; - devmem->pfn_last = devmem->pfn_first + - (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; - - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.altmap_valid = false; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; - - result = devm_memremap_pages(devmem->device, &devmem->pagemap); - if (IS_ERR(result)) - return result; - return devmem; -} -EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); - -/* - * A device driver that wants to handle multiple devices memory through a - * 
single fake device can use hmm_device to do so. This is purely a helper - * and it is not needed to make use of any HMM functionality. - */ -#define HMM_DEVICE_MAX 256 - -static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); -static DEFINE_SPINLOCK(hmm_device_lock); -static struct class *hmm_device_class; -static dev_t hmm_device_devt; - -static void hmm_device_release(struct device *device) -{ - struct hmm_device *hmm_device; - - hmm_device = container_of(device, struct hmm_device, device); - spin_lock(&hmm_device_lock); - clear_bit(hmm_device->minor, hmm_device_mask); - spin_unlock(&hmm_device_lock); - - kfree(hmm_device); -} - -struct hmm_device *hmm_device_new(void *drvdata) -{ - struct hmm_device *hmm_device; - - hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); - if (!hmm_device) - return ERR_PTR(-ENOMEM); - - spin_lock(&hmm_device_lock); - hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); - if (hmm_device->minor >= HMM_DEVICE_MAX) { - spin_unlock(&hmm_device_lock); - kfree(hmm_device); - return ERR_PTR(-EBUSY); - } - set_bit(hmm_device->minor, hmm_device_mask); - spin_unlock(&hmm_device_lock); - - dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); - hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), - hmm_device->minor); - hmm_device->device.release = hmm_device_release; - dev_set_drvdata(&hmm_device->device, drvdata); - hmm_device->device.class = hmm_device_class; - device_initialize(&hmm_device->device); - - return hmm_device; -} -EXPORT_SYMBOL(hmm_device_new); - -void hmm_device_put(struct hmm_device *hmm_device) -{ - put_device(&hmm_device->device); -} -EXPORT_SYMBOL(hmm_device_put); - -static int __init hmm_init(void) -{ - int ret; - - ret = alloc_chrdev_region(&hmm_device_devt, 0, - HMM_DEVICE_MAX, - "hmm_device"); - if (ret) - return ret; - - hmm_device_class = class_create(THIS_MODULE, "hmm_device"); - if (IS_ERR(hmm_device_class)) { - unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); - return PTR_ERR(hmm_device_class); - } - return 0; -} - -device_initcall(hmm_init); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ |
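The mirror-side rework visible in the diff above (hmm_range_register() now taking a mirror rather than an mm, lockdep assertions on mmap_sem, and mmget()-based protection against hmm_release()) implies the usage pattern sketched below. This is a condensed, hypothetical sketch: the mydrv_* name, the 1000 ms timeout and the retry policy are illustrative, and it assumes the caller has already filled in range->pfns, range->flags and range->values.

```c
#include <linux/mm.h>
#include <linux/hmm.h>

static int mydrv_snapshot_range(struct hmm_mirror *mirror,
				struct mm_struct *mm,
				struct hmm_range *range,
				unsigned long start, unsigned long end)
{
	long ret;

	/* Ranges are now registered against the mirror, not the mm. */
	ret = hmm_range_register(range, mirror, start, end, PAGE_SHIFT);
	if (ret)
		return ret;

again:
	/* Wait out any concurrent invalidation notifiers (timeout in ms). */
	hmm_range_wait_until_valid(range, 1000);

	/* hmm_range_snapshot()/hmm_range_fault() now assert mmap_sem is held. */
	down_read(&mm->mmap_sem);
	ret = hmm_range_snapshot(range);
	if (ret < 0) {
		up_read(&mm->mmap_sem);
		if (ret == -EBUSY)
			goto again;
		hmm_range_unregister(range);
		return ret;
	}

	/*
	 * Recheck that no notifier invalidated the range while the snapshot
	 * was taken; a real driver does this under its device page-table
	 * lock before committing the mapping.
	 */
	if (!hmm_range_valid(range)) {
		up_read(&mm->mmap_sem);
		goto again;
	}

	/* ... program the device page tables from range->pfns ... */

	up_read(&mm->mmap_sem);
	hmm_range_unregister(range);
	return 0;
}
```

hmm_mirror_register() itself must be called with mmap_sem held for write, per the lockdep_assert_held_write() added in this diff, and hmm_range_register() now pins the mm with mmget_not_zero() for as long as the range exists.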