From ea091fa53680030881b56520d731e36d3ff6cdd5 Mon Sep 17 00:00:00 2001
From: Xiaoke Wang
Date: Fri, 4 Mar 2022 17:12:15 +0800
Subject: lib/test_meminit: add checks for the allocation functions

alloc_pages(), kmalloc() and vmalloc() are all memory allocation
functions which can return NULL when an internal memory failure happens.
So it is better to check their return values to catch such failures in
time and to test these functions properly.

Link: https://lkml.kernel.org/r/tencent_D44A49FFB420EDCCBFB9221C8D14DFE12908@qq.com
Signed-off-by: Xiaoke Wang
Reviewed-by: Alexander Potapenko
Cc: Andrey Konovalov
Cc: Andrey Ryabinin
Cc: Dmitry Vyukov
Cc: Marco Elver
Signed-off-by: Andrew Morton
---
 lib/test_meminit.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'lib')

diff --git a/lib/test_meminit.c b/lib/test_meminit.c
index c95db11a6906..60e1984c060f 100644
--- a/lib/test_meminit.c
+++ b/lib/test_meminit.c
@@ -67,17 +67,24 @@ static int __init do_alloc_pages_order(int order, int *total_failures)
 	size_t size = PAGE_SIZE << order;
 
 	page = alloc_pages(GFP_KERNEL, order);
+	if (!page)
+		goto err;
 	buf = page_address(page);
 	fill_with_garbage(buf, size);
 	__free_pages(page, order);
 
 	page = alloc_pages(GFP_KERNEL, order);
+	if (!page)
+		goto err;
 	buf = page_address(page);
 	if (count_nonzero_bytes(buf, size))
 		(*total_failures)++;
 	fill_with_garbage(buf, size);
 	__free_pages(page, order);
 	return 1;
+err:
+	(*total_failures)++;
+	return 1;
 }
 
 /* Test the page allocator by calling alloc_pages with different orders. */
@@ -100,15 +107,22 @@ static int __init do_kmalloc_size(size_t size, int *total_failures)
 	void *buf;
 
 	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf)
+		goto err;
 	fill_with_garbage(buf, size);
 	kfree(buf);
 
 	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf)
+		goto err;
 	if (count_nonzero_bytes(buf, size))
 		(*total_failures)++;
 	fill_with_garbage(buf, size);
 	kfree(buf);
 	return 1;
+err:
+	(*total_failures)++;
+	return 1;
 }
 
 /* Test vmalloc() with given parameters. */
@@ -117,15 +131,22 @@ static int __init do_vmalloc_size(size_t size, int *total_failures)
 	void *buf;
 
 	buf = vmalloc(size);
+	if (!buf)
+		goto err;
 	fill_with_garbage(buf, size);
 	vfree(buf);
 
 	buf = vmalloc(size);
+	if (!buf)
+		goto err;
 	if (count_nonzero_bytes(buf, size))
 		(*total_failures)++;
 	fill_with_garbage(buf, size);
 	vfree(buf);
 	return 1;
+err:
+	(*total_failures)++;
+	return 1;
 }
 
 /* Test kmalloc()/vmalloc() by allocating objects of different sizes. */
--
cgit v1.2.3

From 16ce101db85db694a91380aa4c89b25530871d33 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:15 +1000
Subject: mm/memory.c: fix race when faulting a device private page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Fix several device private page reference counting issues", v2

This series aims to fix a number of page reference counting issues in
drivers dealing with device private ZONE_DEVICE pages.  These result in
use-after-free type bugs, either from accessing a struct page which no
longer exists because it has been removed, or from accessing fields
within the struct page which are no longer valid because the page has
been freed.

During normal usage it is unlikely these will cause any problems.
However, without these fixes it is possible to crash the kernel from
userspace.  These crashes can be triggered either by unloading the
kernel module or by unbinding the device from the driver prior to a
userspace task exiting.
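The core of the fix, condensed from the mm/memory.c hunk in this patch
(surrounding fault-handling context and error paths are elided, so this
is a sketch rather than the complete hunk), is to pin the faulting page
under the page table lock before calling into the driver:

	} else if (is_device_private_entry(entry)) {
		vmf->page = pfn_swap_entry_to_page(entry);
		/* Re-check the PTE under the PTL and bail out if the
		 * entry changed underneath us. */
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				vmf->address, &vmf->ptl);
		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
			spin_unlock(vmf->ptl);
			goto out;
		}
		/* While the PTL is held the page is still mapped and so
		 * cannot be freed; the reference taken here keeps the
		 * page (and its pgmap) alive across the callback. */
		get_page(vmf->page);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vmf->page->pgmap->ops->migrate_to_ram(vmf);
		put_page(vmf->page);
	}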
In modules such as Nouveau it is also possible to trigger some of these
issues by explicitly closing the device file descriptor prior to the
task exiting and then accessing device private memory.

This involves some minor changes to both PowerPC and AMD GPU code.
Unfortunately I lack hardware to test either of those, so any help there
would be appreciated.  The changes mimic what is done for both Nouveau
and hmm-tests though, so I doubt they will cause problems.

This patch (of 8):

When the CPU tries to access a device private page the migrate_to_ram()
callback associated with the pgmap for the page is called.  However, no
reference is taken on the faulting page.  Therefore a concurrent
migration of the device private page can free the page and possibly the
underlying pgmap.  This results in a race which can crash the kernel due
to the migrate_to_ram() function pointer becoming invalid.  It also
means drivers can't reliably read the zone_device_data field because the
page may have been freed with memunmap_pages().

Close the race by getting a reference on the page while holding the ptl
to ensure it has not been freed.  Unfortunately the elevated reference
count will cause the migration required to handle the fault to fail.  To
avoid this failure pass the faulting page into the migrate_vma functions
so that if an elevated reference count is found it can be checked to see
whether it is expected or not.

[mpe@ellerman.id.au: fix build]
  Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au
Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com
Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Acked-by: Felix Kuehling
Cc: Jason Gunthorpe
Cc: John Hubbard
Cc: Ralph Campbell
Cc: Michael Ellerman
Cc: Lyude Paul
Cc: Alex Deucher
Cc: Alex Sierra
Cc: Ben Skeggs
Cc: Christian König
Cc: Dan Williams
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Matthew Wilcox
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 arch/powerpc/kvm/book3s_hv_uvmem.c       | 19 ++++++++++--------
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++++-------
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.h |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 11 +++++++----
 include/linux/migrate.h                  |  8 ++++++++
 lib/test_hmm.c                           |  7 ++++---
 mm/memory.c                              | 16 ++++++++++++++-
 mm/migrate.c                             | 34 +++++++++++++++++++-------------
 mm/migrate_device.c                      | 18 ++++++++++++-----
 9 files changed, 89 insertions(+), 43 deletions(-)

(limited to 'lib')

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 598006301620..965c9e9e500b 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
 		unsigned long start,
 		unsigned long end, unsigned long page_shift,
-		struct kvm *kvm, unsigned long gpa)
+		struct kvm *kvm, unsigned long gpa, struct page *fault_page)
 {
 	unsigned long src_pfn, dst_pfn = 0;
-	struct migrate_vma mig;
+	struct migrate_vma mig = { 0 };
 	struct page *dpage, *spage;
 	struct kvmppc_uvmem_page_pvt *pvt;
 	unsigned long pfn;
@@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
 	mig.dst = &dst_pfn;
 	mig.pgmap_owner = &kvmppc_uvmem_pgmap;
 	mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
+	mig.fault_page = fault_page;
 
 	/* The requested page is already paged-out, nothing to do */
 	if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
@@ -580,12 +581,14 @@ out_finalize:
 static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end,
 		unsigned long page_shift,
-		struct kvm *kvm, unsigned long gpa)
+		struct kvm *kvm, unsigned long gpa,
+		struct page *fault_page)
 {
 	int ret;
 
 	mutex_lock(&kvm->arch.uvmem_lock);
-	ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
+	ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa,
+				    fault_page);
 	mutex_unlock(&kvm->arch.uvmem_lock);
 
 	return ret;
@@ -634,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
 			pvt->remove_gfn = true;
 
 			if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
-						  PAGE_SHIFT, kvm, pvt->gpa))
+						  PAGE_SHIFT, kvm, pvt->gpa, NULL))
 				pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
 				       pvt->gpa, addr);
 		} else {
@@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
 		bool pagein)
 {
 	unsigned long src_pfn, dst_pfn = 0;
-	struct migrate_vma mig;
+	struct migrate_vma mig = { 0 };
 	struct page *spage;
 	unsigned long pfn;
 	struct page *dpage;
@@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
 
 	if (kvmppc_svm_page_out(vmf->vma, vmf->address,
 				vmf->address + PAGE_SIZE, PAGE_SHIFT,
-				pvt->kvm, pvt->gpa))
+				pvt->kvm, pvt->gpa, vmf->page))
 		return VM_FAULT_SIGBUS;
 	else
 		return 0;
@@ -1065,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out;
 
-	if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa))
+	if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL))
 		ret = H_SUCCESS;
 out:
 	mmap_read_unlock(kvm->mm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index b059a77b6081..776448bd9fe4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
 	uint64_t npages = (end - start) >> PAGE_SHIFT;
 	struct kfd_process_device *pdd;
 	struct dma_fence *mfence = NULL;
-	struct migrate_vma migrate;
+	struct migrate_vma migrate = { 0 };
 	unsigned long cpages = 0;
 	dma_addr_t *scratch;
 	void *buf;
@@ -668,7 +668,7 @@ out_oom:
 static long
 svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
 		       struct vm_area_struct *vma, uint64_t start, uint64_t end,
-		       uint32_t trigger)
+		       uint32_t trigger, struct page *fault_page)
 {
 	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
 	uint64_t npages = (end - start) >> PAGE_SHIFT;
@@ -676,7 +676,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
 	unsigned long cpages = 0;
 	struct kfd_process_device *pdd;
 	struct dma_fence *mfence = NULL;
-	struct migrate_vma migrate;
+	struct migrate_vma migrate = { 0 };
 	dma_addr_t *scratch;
 	void *buf;
 	int r = -ENOMEM;
@@ -699,6 +699,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
 
 	migrate.src = buf;
 	migrate.dst = migrate.src + npages;
+	migrate.fault_page = fault_page;
 	scratch = (dma_addr_t *)(migrate.dst + npages);
 
 	kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid,
@@ -766,7 +767,7 @@ out:
  * 0 - OK, otherwise error code
  */
 int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
-			    uint32_t trigger)
+			    uint32_t trigger, struct page *fault_page)
 {
 	struct amdgpu_device *adev;
 	struct vm_area_struct *vma;
@@ -807,7 +808,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
 		}
 
 		next = min(vma->vm_end, end);
-		r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger);
+		r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger,
+					   fault_page);
 		if (r < 0) {
 			pr_debug("failed %ld to migrate prange %p\n", r, prange);
 			break;
@@ -851,7 +853,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
 	pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);
 
 	do {
-		r = svm_migrate_vram_to_ram(prange, mm, trigger);
+		r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL);
 		if (r)
 			return r;
 	} while (prange->actual_loc && --retries);
@@ -938,7 +940,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
 		goto out_unlock_prange;
 	}
 
-	r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU);
+	r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
+				    vmf->page);
 	if (r)
 		pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r,
 			 prange, prange->start, prange->last);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
index b3f0754b32fa..a5d7e6d22264 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
@@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR {
 
 int svm_migrate_to_vram(struct svm_range *prange,  uint32_t best_loc,
			struct mm_struct *mm, uint32_t trigger);
 int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
-			    uint32_t trigger);
+			    uint32_t trigger, struct page *fault_page);
 unsigned long
 svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 11074cc8c333..9139e5a0b2a0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2913,13 +2913,15 @@ retry_write_locked:
 			 */
 			if (prange->actual_loc)
 				r = svm_migrate_vram_to_ram(prange, mm,
-					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
+					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
+					NULL);
 			else
 				r = 0;
 		}
 	} else {
 		r = svm_migrate_vram_to_ram(prange, mm,
-				KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
+				KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
+				NULL);
 	}
 	if (r) {
 		pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
@@ -3242,7 +3244,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
 		return 0;
 
 	if (!best_loc) {
-		r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
+		r = svm_migrate_vram_to_ram(prange, mm,
+					KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
 		*migrated = !r;
 		return r;
 	}
@@ -3303,7 +3306,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
 		mutex_lock(&prange->migrate_mutex);
 		do {
 			r = svm_migrate_vram_to_ram(prange, mm,
-					KFD_MIGRATE_TRIGGER_TTM_EVICTION);
+					KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
 		} while (!r && prange->actual_loc && --retries);
 
 		if (!r && prange->actual_loc)
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 704a04f5a074..52090d1f9230 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES];
 #ifdef CONFIG_MIGRATION
 
 extern void putback_movable_pages(struct list_head *l);
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+		struct folio *src, enum migrate_mode mode, int extra_count);
 int migrate_folio(struct address_space *mapping, struct folio *dst,
 		struct folio *src, enum migrate_mode mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
@@ -197,6 +199,12 @@ struct migrate_vma {
 	 */
 	void *pgmap_owner;
 	unsigned long flags;
+
+	/*
+	 * Set to vmf->page if this is being called to migrate a page as part of
+	 * a migrate_to_ram() callback.
+	 */
+	struct page *fault_page;
 };
 
 int migrate_vma_setup(struct migrate_vma *args);
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 6a33f6b1b465..e566166b5571 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -907,7 +907,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror,
 	struct vm_area_struct *vma;
 	unsigned long src_pfns[64] = { 0 };
 	unsigned long dst_pfns[64] = { 0 };
-	struct migrate_vma args;
+	struct migrate_vma args = { 0 };
 	unsigned long next;
 	int ret;
 
@@ -968,7 +968,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror,
 	unsigned long src_pfns[64] = { 0 };
 	unsigned long dst_pfns[64] = { 0 };
 	struct dmirror_bounce bounce;
-	struct migrate_vma args;
+	struct migrate_vma args = { 0 };
 	unsigned long next;
 	int ret;
 
@@ -1334,7 +1334,7 @@ static void dmirror_devmem_free(struct page *page)
 
 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 {
-	struct migrate_vma args;
+	struct migrate_vma args = { 0 };
 	unsigned long src_pfns = 0;
 	unsigned long dst_pfns = 0;
 	struct page *rpage;
@@ -1357,6 +1357,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
 	args.dst = &dst_pfns;
 	args.pgmap_owner = dmirror->mdevice;
 	args.flags = dmirror_select_device(dmirror);
+	args.fault_page = vmf->page;
 
 	if (migrate_vma_setup(&args))
 		return VM_FAULT_SIGBUS;
diff --git a/mm/memory.c b/mm/memory.c
index 2c7723ea4371..4ad6077164cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3750,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		ret = remove_device_exclusive_entry(vmf);
 	} else if (is_device_private_entry(entry)) {
 		vmf->page = pfn_swap_entry_to_page(entry);
-		ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+				vmf->address, &vmf->ptl);
+		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+			spin_unlock(vmf->ptl);
+			goto out;
+		}
+
+		/*
+		 * Get a page reference while we know the page can't be
+		 * freed.
+		 */
+		get_page(vmf->page);
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+		vmf->page->pgmap->ops->migrate_to_ram(vmf);
+		put_page(vmf->page);
 	} else if (is_hwpoison_entry(entry)) {
 		ret = VM_FAULT_HWPOISON;
 	} else if (is_swapin_error_entry(entry)) {
diff --git a/mm/migrate.c b/mm/migrate.c
index c228afba0963..1379e1912772 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
  *                    Migration functions
  ***********************************************************/
 
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+		struct folio *src, enum migrate_mode mode, int extra_count)
+{
+	int rc;
+
+	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */
+
+	rc = folio_migrate_mapping(mapping, dst, src, extra_count);
+
+	if (rc != MIGRATEPAGE_SUCCESS)
+		return rc;
+
+	if (mode != MIGRATE_SYNC_NO_COPY)
+		folio_migrate_copy(dst, src);
+	else
+		folio_migrate_flags(dst, src);
+	return MIGRATEPAGE_SUCCESS;
+}
+
 /**
  * migrate_folio() - Simple folio migration.
  * @mapping: The address_space containing the folio.
@@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy);
 int migrate_folio(struct address_space *mapping, struct folio *dst,
 		struct folio *src, enum migrate_mode mode)
 {
-	int rc;
-
-	BUG_ON(folio_test_writeback(src));	/* Writeback must be complete */
-
-	rc = folio_migrate_mapping(mapping, dst, src, 0);
-
-	if (rc != MIGRATEPAGE_SUCCESS)
-		return rc;
-
-	if (mode != MIGRATE_SYNC_NO_COPY)
-		folio_migrate_copy(dst, src);
-	else
-		folio_migrate_flags(dst, src);
-	return MIGRATEPAGE_SUCCESS;
+	return migrate_folio_extra(mapping, dst, src, mode, 0);
 }
 EXPORT_SYMBOL(migrate_folio);
 
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5ab6ab9d2ed8..8dee38ffcda2 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -325,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
  * folio_migrate_mapping(), except that here we allow migration of a
  * ZONE_DEVICE page.
  */
-static bool migrate_vma_check_page(struct page *page)
+static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
 {
 	/*
 	 * One extra ref because caller holds an extra reference, either from
 	 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
 	 * a device page.
 	 */
-	int extra = 1;
+	int extra = 1 + (page == fault_page);
 
 	/*
 	 * FIXME support THP (transparent huge page), it is bit more complex to
@@ -405,7 +405,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
 		if (folio_mapped(folio))
 			try_to_migrate(folio, 0);
 
-		if (page_mapped(page) || !migrate_vma_check_page(page)) {
+		if (page_mapped(page) ||
+		    !migrate_vma_check_page(page, migrate->fault_page)) {
 			if (!is_zone_device_page(page)) {
 				get_page(page);
 				putback_lru_page(page);
@@ -517,6 +518,8 @@ int migrate_vma_setup(struct migrate_vma *args)
 		return -EINVAL;
 	if (!args->src || !args->dst)
 		return -EINVAL;
+	if (args->fault_page && !is_device_private_page(args->fault_page))
+		return -EINVAL;
 
 	memset(args->src, 0, sizeof(*args->src) * nr_pages);
 	args->cpages = 0;
@@ -747,8 +750,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
 			continue;
 		}
 
-		r = migrate_folio(mapping, page_folio(newpage),
-				page_folio(page), MIGRATE_SYNC_NO_COPY);
+		if (migrate->fault_page == page)
+			r = migrate_folio_extra(mapping, page_folio(newpage),
+						page_folio(page),
+						MIGRATE_SYNC_NO_COPY, 1);
+		else
+			r = migrate_folio(mapping, page_folio(newpage),
+					page_folio(page), MIGRATE_SYNC_NO_COPY);
 		if (r != MIGRATEPAGE_SUCCESS)
 			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
 	}
--
cgit v1.2.3

From ef233450898f8893dafa193a9f3211fa077a3d05 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:16 +1000
Subject: mm: free device private pages have zero refcount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page
refcount") device private pages have no longer had an extra reference
count when the page is in use.  However, before handing them back to the
owning device driver we add an extra reference count such that free
pages have a reference count of one.

This makes it difficult to tell if a page is free or not because both
free and in-use pages will have a non-zero refcount.  Instead we should
return pages to the driver's page allocator with a zero reference count.
Kernel code can then safely use kernel functions such as
get_page_unless_zero().
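For example, with this change a caller holding a pfn can reliably tell
the two states apart.  A minimal sketch, not part of this patch;
try_get_device_page() is a hypothetical helper name:

	/*
	 * Illustrates the new invariant only: free device private pages
	 * sit at refcount zero, so get_page_unless_zero() fails for them
	 * and succeeds (taking a reference) for pages still in use.
	 */
	static struct page *try_get_device_page(unsigned long pfn)
	{
		struct page *page = pfn_to_page(pfn);

		if (!get_page_unless_zero(page))
			return NULL;	/* free: owned by the driver's allocator */

		return page;		/* in use: caller must put_page() when done */
	}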
Link: https://lkml.kernel.org/r/cf70cf6f8c0bdb8aaebdbfb0d790aea4c683c3c6.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Acked-by: Felix Kuehling
Cc: Jason Gunthorpe
Cc: Michael Ellerman
Cc: Alex Deucher
Cc: Christian König
Cc: Ben Skeggs
Cc: Lyude Paul
Cc: Ralph Campbell
Cc: Alex Sierra
Cc: John Hubbard
Cc: Dan Williams
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Matthew Wilcox
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 arch/powerpc/kvm/book3s_hv_uvmem.c       | 2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +-
 drivers/gpu/drm/nouveau/nouveau_dmem.c   | 2 +-
 include/linux/memremap.h                 | 1 +
 lib/test_hmm.c                           | 2 +-
 mm/memremap.c                            | 9 +++++++++
 mm/page_alloc.c                          | 8 ++++++++
 7 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'lib')

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 965c9e9e500b..e2f11f9c3f2a 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -718,7 +718,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 
 	dpage = pfn_to_page(uvmem_pfn);
 	dpage->zone_device_data = pvt;
-	lock_page(dpage);
+	zone_device_page_init(dpage);
 	return dpage;
 out_clear:
 	spin_lock(&kvmppc_uvmem_bitmap_lock);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 776448bd9fe4..97a684568ae0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -223,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
 	page = pfn_to_page(pfn);
 	svm_range_bo_ref(prange->svm_bo);
 	page->zone_device_data = prange->svm_bo;
-	lock_page(page);
+	zone_device_page_init(page);
 }
 
 static void
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 16356611b5b9..b092988266a6 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -326,7 +326,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
 			return NULL;
 	}
 
-	lock_page(page);
+	zone_device_page_init(page);
 	return page;
 }
 
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c3b4cc84877b..7fcaf3180a5b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
+void zone_device_page_init(struct page *page);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e566166b5571..bc2b94991165 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -627,8 +627,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 			goto error;
 	}
 
+	zone_device_page_init(dpage);
 	dpage->zone_device_data = rpage;
-	lock_page(dpage);
 	return dpage;
 
 error:
diff --git a/mm/memremap.c b/mm/memremap.c
index 25029a474d30..1c2c038f3410 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -505,8 +505,17 @@ void free_zone_device_page(struct page *page)
 	/*
 	 * Reset the page count to 1 to prepare for handing out the page again.
 	 */
+	if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
+	    page->pgmap->type != MEMORY_DEVICE_COHERENT)
+		set_page_count(page, 1);
+}
+
+void zone_device_page_init(struct page *page)
+{
 	set_page_count(page, 1);
+	lock_page(page);
 }
+EXPORT_SYMBOL_GPL(zone_device_page_init);
 
 #ifdef CONFIG_FS_DAX
 bool __put_devmap_managed_page_refs(struct page *page, int refs)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 12b6184cbbed..059f6946832f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6819,6 +6819,14 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 		cond_resched();
 	}
+
+	/*
+	 * ZONE_DEVICE pages are released directly to the driver page allocator
+	 * which will set the page count to 1 when allocating the page.
+	 */
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+	    pgmap->type == MEMORY_DEVICE_COHERENT)
+		set_page_count(page, 0);
 }
 
 /*
--
cgit v1.2.3

From ad4c365221b0f92f9c24a203119f2bade30c970e Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Wed, 28 Sep 2022 22:01:22 +1000
Subject: hmm-tests: add test for migrate_device_range()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Link: https://lkml.kernel.org/r/a73cf109de0224cfd118d22be58ddebac3ae2897.1664366292.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Cc: Jason Gunthorpe
Cc: Ralph Campbell
Cc: John Hubbard
Cc: Alex Sierra
Cc: Felix Kuehling
Cc: Alex Deucher
Cc: Ben Skeggs
Cc: Christian König
Cc: Dan Williams
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Lyude Paul
Cc: Matthew Wilcox
Cc: Michael Ellerman
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 lib/test_hmm.c                         | 120 +++++++++++++++++++++++++++------
 lib/test_hmm_uapi.h                    |   1 +
 tools/testing/selftests/vm/hmm-tests.c |  49 ++++++++++++++
 3 files changed, 149 insertions(+), 21 deletions(-)

(limited to 'lib')

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index bc2b94991165..67e6f83fe0f8 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -100,6 +100,7 @@ struct dmirror {
 struct dmirror_chunk {
 	struct dev_pagemap	pagemap;
 	struct dmirror_device	*mdevice;
+	bool remove;
 };
 
 /*
@@ -192,11 +193,15 @@ static int dmirror_fops_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static struct dmirror_chunk *dmirror_page_to_chunk(struct page *page)
+{
+	return container_of(page->pgmap, struct dmirror_chunk, pagemap);
+}
+
 static struct dmirror_device *dmirror_page_to_device(struct page *page)
 
 {
-	return container_of(page->pgmap, struct dmirror_chunk,
-			    pagemap)->mdevice;
+	return dmirror_page_to_chunk(page)->mdevice;
 }
 
 static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
@@ -1218,6 +1223,85 @@ static int dmirror_snapshot(struct dmirror *dmirror,
 	return ret;
 }
 
+static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
+{
+	unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT;
+	unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT;
+	unsigned long npages = end_pfn - start_pfn + 1;
+	unsigned long i;
+	unsigned long *src_pfns;
+	unsigned long *dst_pfns;
+
+	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
+	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+
+	migrate_device_range(src_pfns, start_pfn, npages);
+	for (i = 0; i < npages; i++) {
+		struct page *dpage, *spage;
+
+		spage = migrate_pfn_to_page(src_pfns[i]);
+		if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		if (WARN_ON(!is_device_private_page(spage) &&
+			    !is_device_coherent_page(spage)))
+			continue;
+		spage = BACKING_PAGE(spage);
+		dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
+		lock_page(dpage);
+		copy_highpage(dpage, spage);
+		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
+		if (src_pfns[i] & MIGRATE_PFN_WRITE)
+			dst_pfns[i] |= MIGRATE_PFN_WRITE;
+	}
+	migrate_device_pages(src_pfns, dst_pfns, npages);
+	migrate_device_finalize(src_pfns, dst_pfns, npages);
+	kfree(src_pfns);
+	kfree(dst_pfns);
+}
+
+/* Removes free pages from the free list so they can't be re-allocated */
+static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
+{
+	struct dmirror_device *mdevice = devmem->mdevice;
+	struct page *page;
+
+	for (page = mdevice->free_pages; page; page = page->zone_device_data)
+		if (dmirror_page_to_chunk(page) == devmem)
+			mdevice->free_pages = page->zone_device_data;
+}
+
+static void dmirror_device_remove_chunks(struct dmirror_device *mdevice)
+{
+	unsigned int i;
+
+	mutex_lock(&mdevice->devmem_lock);
+	if (mdevice->devmem_chunks) {
+		for (i = 0; i < mdevice->devmem_count; i++) {
+			struct dmirror_chunk *devmem =
+				mdevice->devmem_chunks[i];
+
+			spin_lock(&mdevice->lock);
+			devmem->remove = true;
+			dmirror_remove_free_pages(devmem);
+			spin_unlock(&mdevice->lock);
+
+			dmirror_device_evict_chunk(devmem);
+			memunmap_pages(&devmem->pagemap);
+			if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+				release_mem_region(devmem->pagemap.range.start,
+						   range_len(&devmem->pagemap.range));
+			kfree(devmem);
+		}
+		mdevice->devmem_count = 0;
+		mdevice->devmem_capacity = 0;
+		mdevice->free_pages = NULL;
+		kfree(mdevice->devmem_chunks);
+		mdevice->devmem_chunks = NULL;
+	}
+	mutex_unlock(&mdevice->devmem_lock);
+}
+
 static long
 dmirror_fops_unlocked_ioctl(struct file *filp,
 			    unsigned int command,
@@ -1272,6 +1356,11 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
 		ret = dmirror_snapshot(dmirror, &cmd);
 		break;
 
+	case HMM_DMIRROR_RELEASE:
+		dmirror_device_remove_chunks(dmirror->mdevice);
+		ret = 0;
+		break;
+
 	default:
 		return -EINVAL;
 	}
@@ -1326,9 +1415,13 @@ static void dmirror_devmem_free(struct page *page)
 
 	mdevice = dmirror_page_to_device(page);
 	spin_lock(&mdevice->lock);
-	mdevice->cfree++;
-	page->zone_device_data = mdevice->free_pages;
-	mdevice->free_pages = page;
+
+	/* Return page to our allocator if not freeing the chunk */
+	if (!dmirror_page_to_chunk(page)->remove) {
+		mdevice->cfree++;
+		page->zone_device_data = mdevice->free_pages;
+		mdevice->free_pages = page;
+	}
 	spin_unlock(&mdevice->lock);
 }
 
@@ -1408,22 +1501,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
 
 static void dmirror_device_remove(struct dmirror_device *mdevice)
 {
-	unsigned int i;
-
-	if (mdevice->devmem_chunks) {
-		for (i = 0; i < mdevice->devmem_count; i++) {
-			struct dmirror_chunk *devmem =
-				mdevice->devmem_chunks[i];
-
-			memunmap_pages(&devmem->pagemap);
-			if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
-				release_mem_region(devmem->pagemap.range.start,
-						   range_len(&devmem->pagemap.range));
-			kfree(devmem);
-		}
-		kfree(mdevice->devmem_chunks);
-	}
-
+	dmirror_device_remove_chunks(mdevice);
 	cdev_device_del(&mdevice->cdevice, &mdevice->device);
 }
 
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index e31d58c9034a..8c818a2cf4f6 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -36,6 +36,7 @@ struct hmm_dmirror_cmd {
 #define HMM_DMIRROR_SNAPSHOT		_IOWR('H', 0x04, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_EXCLUSIVE		_IOWR('H', 0x05, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_CHECK_EXCLUSIVE	_IOWR('H', 0x06, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_RELEASE		_IOWR('H', 0x07, struct hmm_dmirror_cmd)
 
 /*
  * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index f2c2c970eeb2..28232adec883 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1054,6 +1054,55 @@ TEST_F(hmm, migrate_fault)
 	hmm_buffer_free(buffer);
 }
 
+TEST_F(hmm, migrate_release)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Release device memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+
+	/* Fault pages back to system memory and check them. */
+	for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
 /*
  * Migrate anonymous shared memory to device private memory.
  */
--
cgit v1.2.3