From 7be1c1a3c7b13fb259bb5159662a7b83622013b8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 11 Oct 2022 20:55:31 +0300 Subject: mm: more vma cache removal Link: https://lkml.kernel.org/r/Y0WuE3Riv4iy5Jx8@localhost.localdomain Fixes: 7964cf8caa4d ("mm: remove vmacache") Signed-off-by: Alexey Dobriyan Acked-by: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 88a043f7235e..e0bb85cf8bdd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -861,8 +861,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; - /* Per-thread vma caching: */ - #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; #endif -- cgit v1.2.3 From 652e04464d3944226052c827bdaaf5113b072870 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 27 Sep 2022 08:19:45 +0800 Subject: mm/damon: move sz_damon_region to damon_sz_region Rename sz_damon_region() to damon_sz_region(), and move it to "include/linux/damon.h", because in many places, we can to use this func. Link: https://lkml.kernel.org/r/20220927001946.85375-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 9 ++------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ed5470f50bab..620ada094c3b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t) return list_first_entry(&t->regions_list, struct damon_region, list); } +static inline unsigned long damon_sz_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} + + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 4de8c7c52979..5b9e0d585aef 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -864,18 +864,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -static inline unsigned long sz_damon_region(struct damon_region *r) -{ - return r->ar.end - r->ar.start; -} - /* * Merge two adjacent regions into one region */ static void damon_merge_two_regions(struct damon_target *t, struct damon_region *l, struct damon_region *r) { - unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); @@ -904,7 +899,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && - sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else prev = r; -- cgit v1.2.3 From 16ce101db85db694a91380aa4c89b25530871d33 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:15 +1000 Subject: mm/memory.c: fix race when faulting a device private page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Fix several device private page reference counting issues", v2 This series aims to fix a number of page reference counting issues in drivers dealing with device private ZONE_DEVICE pages. These result in use-after-free type bugs, either from accessing a struct page which no longer exists because it has been removed or accessing fields within the struct page which are no longer valid because the page has been freed. During normal usage it is unlikely these will cause any problems. However without these fixes it is possible to crash the kernel from userspace. These crashes can be triggered either by unloading the kernel module or unbinding the device from the driver prior to a userspace task exiting. In modules such as Nouveau it is also possible to trigger some of these issues by explicitly closing the device file-descriptor prior to the task exiting and then accessing device private memory. This involves some minor changes to both PowerPC and AMD GPU code. Unfortunately I lack hardware to test either of those so any help there would be appreciated. The changes mimic what is done in for both Nouveau and hmm-tests though so I doubt they will cause problems. This patch (of 8): When the CPU tries to access a device private page the migrate_to_ram() callback associated with the pgmap for the page is called. However no reference is taken on the faulting page. Therefore a concurrent migration of the device private page can free the page and possibly the underlying pgmap. This results in a race which can crash the kernel due to the migrate_to_ram() function pointer becoming invalid. It also means drivers can't reliably read the zone_device_data field because the page may have been freed with memunmap_pages(). Close the race by getting a reference on the page while holding the ptl to ensure it has not been freed. Unfortunately the elevated reference count will cause the migration required to handle the fault to fail. To avoid this failure pass the faulting page into the migrate_vma functions so that if an elevated reference count is found it can be checked to see if it's expected or not. [mpe@ellerman.id.au: fix build] Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Felix Kuehling Cc: Jason Gunthorpe Cc: John Hubbard Cc: Ralph Campbell Cc: Michael Ellerman Cc: Lyude Paul Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 19 ++++++++++-------- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++++------- drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +++++++---- include/linux/migrate.h | 8 ++++++++ lib/test_hmm.c | 7 ++++--- mm/memory.c | 16 ++++++++++++++- mm/migrate.c | 34 +++++++++++++++++++------------- mm/migrate_device.c | 18 ++++++++++++----- 9 files changed, 89 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 598006301620..965c9e9e500b 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) static int __kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, struct page *fault_page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *dpage, *spage; struct kvmppc_uvmem_page_pvt *pvt; unsigned long pfn; @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, mig.dst = &dst_pfn; mig.pgmap_owner = &kvmppc_uvmem_pgmap; mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + mig.fault_page = fault_page; /* The requested page is already paged-out, nothing to do */ if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) @@ -580,12 +581,14 @@ out_finalize: static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, + struct page *fault_page) { int ret; mutex_lock(&kvm->arch.uvmem_lock); - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa); + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, + fault_page); mutex_unlock(&kvm->arch.uvmem_lock); return ret; @@ -634,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, pvt->remove_gfn = true; if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE, - PAGE_SHIFT, kvm, pvt->gpa)) + PAGE_SHIFT, kvm, pvt->gpa, NULL)) pr_err("Can't page out gpa:0x%lx addr:0x%lx\n", pvt->gpa, addr); } else { @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, bool pagein) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *spage; unsigned long pfn; struct page *dpage; @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) if (kvmppc_svm_page_out(vmf->vma, vmf->address, vmf->address + PAGE_SIZE, PAGE_SHIFT, - pvt->kvm, pvt->gpa)) + pvt->kvm, pvt->gpa, vmf->page)) return VM_FAULT_SIGBUS; else return 0; @@ -1065,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa, if (!vma || vma->vm_start > start || vma->vm_end < end) goto out; - if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa)) + if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL)) ret = H_SUCCESS; out: mmap_read_unlock(kvm->mm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index b059a77b6081..776448bd9fe4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange, uint64_t npages = (end - start) >> PAGE_SHIFT; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; unsigned long cpages = 0; dma_addr_t *scratch; void *buf; @@ -668,7 +668,7 @@ out_oom: static long svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct vm_area_struct *vma, uint64_t start, uint64_t end, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); uint64_t npages = (end - start) >> PAGE_SHIFT; @@ -676,7 +676,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, unsigned long cpages = 0; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; dma_addr_t *scratch; void *buf; int r = -ENOMEM; @@ -699,6 +699,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, migrate.src = buf; migrate.dst = migrate.src + npages; + migrate.fault_page = fault_page; scratch = (dma_addr_t *)(migrate.dst + npages); kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid, @@ -766,7 +767,7 @@ out: * 0 - OK, otherwise error code */ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct amdgpu_device *adev; struct vm_area_struct *vma; @@ -807,7 +808,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, } next = min(vma->vm_end, end); - r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger); + r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger, + fault_page); if (r < 0) { pr_debug("failed %ld to migrate prange %p\n", r, prange); break; @@ -851,7 +853,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc, pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc); do { - r = svm_migrate_vram_to_ram(prange, mm, trigger); + r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL); if (r) return r; } while (prange->actual_loc && --retries); @@ -938,7 +940,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) goto out_unlock_prange; } - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU); + r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, + vmf->page); if (r) pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r, prange, prange->start, prange->last); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h index b3f0754b32fa..a5d7e6d22264 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h @@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR { int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc, struct mm_struct *mm, uint32_t trigger); int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger); + uint32_t trigger, struct page *fault_page); unsigned long svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 11074cc8c333..9139e5a0b2a0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2913,13 +2913,15 @@ retry_write_locked: */ if (prange->actual_loc) r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); else r = 0; } } else { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); } if (r) { pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", @@ -3242,7 +3244,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, return 0; if (!best_loc) { - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); + r = svm_migrate_vram_to_ram(prange, mm, + KFD_MIGRATE_TRIGGER_PREFETCH, NULL); *migrated = !r; return r; } @@ -3303,7 +3306,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) mutex_lock(&prange->migrate_mutex); do { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_TTM_EVICTION); + KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); } while (!r && prange->actual_loc && --retries); if (!r && prange->actual_loc) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 704a04f5a074..52090d1f9230 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, @@ -197,6 +199,12 @@ struct migrate_vma { */ void *pgmap_owner; unsigned long flags; + + /* + * Set to vmf->page if this is being called to migrate a page as part of + * a migrate_to_ram() callback. + */ + struct page *fault_page; }; int migrate_vma_setup(struct migrate_vma *args); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 6a33f6b1b465..e566166b5571 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -907,7 +907,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, struct vm_area_struct *vma; unsigned long src_pfns[64] = { 0 }; unsigned long dst_pfns[64] = { 0 }; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -968,7 +968,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, unsigned long src_pfns[64] = { 0 }; unsigned long dst_pfns[64] = { 0 }; struct dmirror_bounce bounce; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -1334,7 +1334,7 @@ static void dmirror_devmem_free(struct page *page) static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long src_pfns = 0; unsigned long dst_pfns = 0; struct page *rpage; @@ -1357,6 +1357,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; args.flags = dmirror_select_device(dmirror); + args.fault_page = vmf->page; if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; diff --git a/mm/memory.c b/mm/memory.c index 2c7723ea4371..4ad6077164cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3750,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + spin_unlock(vmf->ptl); + goto out; + } + + /* + * Get a page reference while we know the page can't be + * freed. + */ + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->page->pgmap->ops->migrate_to_ram(vmf); + put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_swapin_error_entry(entry)) { diff --git a/mm/migrate.c b/mm/migrate.c index c228afba0963..1379e1912772 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy); * Migration functions ***********************************************************/ +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count) +{ + int rc; + + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ + + rc = folio_migrate_mapping(mapping, dst, src, extra_count); + + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} + /** * migrate_folio() - Simple folio migration. * @mapping: The address_space containing the folio. @@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { - int rc; - - BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ - - rc = folio_migrate_mapping(mapping, dst, src, 0); - - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(dst, src); - else - folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return migrate_folio_extra(mapping, dst, src, mode, 0); } EXPORT_SYMBOL(migrate_folio); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5ab6ab9d2ed8..8dee38ffcda2 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -325,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * folio_migrate_mapping(), except that here we allow migration of a * ZONE_DEVICE page. */ -static bool migrate_vma_check_page(struct page *page) +static bool migrate_vma_check_page(struct page *page, struct page *fault_page) { /* * One extra ref because caller holds an extra reference, either from * isolate_lru_page() for a regular page, or migrate_vma_collect() for * a device page. */ - int extra = 1; + int extra = 1 + (page == fault_page); /* * FIXME support THP (transparent huge page), it is bit more complex to @@ -405,7 +405,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) if (folio_mapped(folio)) try_to_migrate(folio, 0); - if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (page_mapped(page) || + !migrate_vma_check_page(page, migrate->fault_page)) { if (!is_zone_device_page(page)) { get_page(page); putback_lru_page(page); @@ -517,6 +518,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (!args->src || !args->dst) return -EINVAL; + if (args->fault_page && !is_device_private_page(args->fault_page)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -747,8 +750,13 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - r = migrate_folio(mapping, page_folio(newpage), - page_folio(page), MIGRATE_SYNC_NO_COPY); + if (migrate->fault_page == page) + r = migrate_folio_extra(mapping, page_folio(newpage), + page_folio(page), + MIGRATE_SYNC_NO_COPY, 1); + else + r = migrate_folio(mapping, page_folio(newpage), + page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } -- cgit v1.2.3 From ef233450898f8893dafa193a9f3211fa077a3d05 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:16 +1000 Subject: mm: free device private pages have zero refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") device private pages have no longer had an extra reference count when the page is in use. However before handing them back to the owning device driver we add an extra reference count such that free pages have a reference count of one. This makes it difficult to tell if a page is free or not because both free and in use pages will have a non-zero refcount. Instead we should return pages to the drivers page allocator with a zero reference count. Kernel code can then safely use kernel functions such as get_page_unless_zero(). Link: https://lkml.kernel.org/r/cf70cf6f8c0bdb8aaebdbfb0d790aea4c683c3c6.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Felix Kuehling Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Alex Deucher Cc: Christian König Cc: Ben Skeggs Cc: Lyude Paul Cc: Ralph Campbell Cc: Alex Sierra Cc: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/memremap.h | 1 + lib/test_hmm.c | 2 +- mm/memremap.c | 9 +++++++++ mm/page_alloc.c | 8 ++++++++ 7 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 965c9e9e500b..e2f11f9c3f2a 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -718,7 +718,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - lock_page(dpage); + zone_device_page_init(dpage); return dpage; out_clear: spin_lock(&kvmppc_uvmem_bitmap_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 776448bd9fe4..97a684568ae0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -223,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - lock_page(page); + zone_device_page_init(page); } static void diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 16356611b5b9..b092988266a6 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - lock_page(page); + zone_device_page_init(page); return page; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c3b4cc84877b..7fcaf3180a5b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio) } #ifdef CONFIG_ZONE_DEVICE +void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e566166b5571..bc2b94991165 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -627,8 +627,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) goto error; } + zone_device_page_init(dpage); dpage->zone_device_data = rpage; - lock_page(dpage); return dpage; error: diff --git a/mm/memremap.c b/mm/memremap.c index 25029a474d30..1c2c038f3410 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -505,8 +505,17 @@ void free_zone_device_page(struct page *page) /* * Reset the page count to 1 to prepare for handing out the page again. */ + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_COHERENT) + set_page_count(page, 1); +} + +void zone_device_page_init(struct page *page) +{ set_page_count(page, 1); + lock_page(page); } +EXPORT_SYMBOL_GPL(zone_device_page_init); #ifdef CONFIG_FS_DAX bool __put_devmap_managed_page_refs(struct page *page, int refs) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12b6184cbbed..059f6946832f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6819,6 +6819,14 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } + + /* + * ZONE_DEVICE pages are released directly to the driver page allocator + * which will set the page count to 1 when allocating the page. + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT) + set_page_count(page, 0); } /* -- cgit v1.2.3 From e778406b40dbb1342a1888cd751ca9d2982a12e2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:19 +1000 Subject: mm/migrate_device.c: add migrate_device_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device drivers can use the migrate_vma family of functions to migrate existing private anonymous mappings to device private pages. These pages are backed by memory on the device with drivers being responsible for copying data to and from device memory. Device private pages are freed via the pgmap->page_free() callback when they are unmapped and their refcount drops to zero. Alternatively they may be freed indirectly via migration back to CPU memory in response to a pgmap->migrate_to_ram() callback called whenever the CPU accesses an address mapped to a device private page. In other words drivers cannot control the lifetime of data allocated on the devices and must wait until these pages are freed from userspace. This causes issues when memory needs to reclaimed on the device, either because the device is going away due to a ->release() callback or because another user needs to use the memory. Drivers could use the existing migrate_vma functions to migrate data off the device. However this would require them to track the mappings of each page which is both complicated and not always possible. Instead drivers need to be able to migrate device pages directly so they can free up device memory. To allow that this patch introduces the migrate_device family of functions which are functionally similar to migrate_vma but which skips the initial lookup based on mapping. Link: https://lkml.kernel.org/r/868116aab70b0c8ee467d62498bb2cf0ef907295.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Cc: "Huang, Ying" Cc: Zi Yan Cc: Matthew Wilcox Cc: Yang Shi Cc: David Hildenbrand Cc: Ralph Campbell Cc: John Hubbard Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Lyude Paul Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/migrate.h | 7 ++++ mm/migrate_device.c | 89 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 89 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 52090d1f9230..3ef77f52a4f0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -210,6 +210,13 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages); +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages); +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages); + #endif /* CONFIG_MIGRATION */ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 7707c1d898f5..6fa682eef7a0 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -693,7 +693,7 @@ abort: *src &= ~MIGRATE_PFN_MIGRATE; } -static void migrate_device_pages(unsigned long *src_pfns, +static void __migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, unsigned long npages, struct migrate_vma *migrate) { @@ -715,6 +715,9 @@ static void migrate_device_pages(unsigned long *src_pfns, if (!page) { unsigned long addr; + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + /* * The only time there is no vma is when called from * migrate_device_coherent_page(). However this isn't @@ -722,8 +725,6 @@ static void migrate_device_pages(unsigned long *src_pfns, */ VM_BUG_ON(!migrate); addr = migrate->start + i*PAGE_SIZE; - if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; if (!notified) { notified = true; @@ -778,6 +779,22 @@ static void migrate_device_pages(unsigned long *src_pfns, mmu_notifier_invalidate_range_only_end(&range); } +/** + * migrate_device_pages() - migrate meta-data from src page to dst page + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Equivalent to migrate_vma_pages(). This is called to migrate struct page + * meta-data from source struct page to destination. + */ +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages) +{ + __migrate_device_pages(src_pfns, dst_pfns, npages, NULL); +} +EXPORT_SYMBOL(migrate_device_pages); + /** * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information @@ -788,12 +805,22 @@ static void migrate_device_pages(unsigned long *src_pfns, */ void migrate_vma_pages(struct migrate_vma *migrate) { - migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); + __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); } EXPORT_SYMBOL(migrate_vma_pages); -static void migrate_device_finalize(unsigned long *src_pfns, - unsigned long *dst_pfns, unsigned long npages) +/* + * migrate_device_finalize() - complete page migration + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Completes migration of the page by removing special migration entries. + * Drivers must ensure copying of page data is complete and visible to the CPU + * before calling this. + */ +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) { unsigned long i; @@ -837,6 +864,7 @@ static void migrate_device_finalize(unsigned long *src_pfns, } } } +EXPORT_SYMBOL(migrate_device_finalize); /** * migrate_vma_finalize() - restore CPU page table entry @@ -855,6 +883,53 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); +/** + * migrate_device_range() - migrate device private pfns to normal memory. + * @src_pfns: array large enough to hold migrating source device private pfns. + * @start: starting pfn in the range to migrate. + * @npages: number of pages to migrate. + * + * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that + * instead of looking up pages based on virtual address mappings a range of + * device pfns that should be migrated to system memory is used instead. + * + * This is useful when a driver needs to free device memory but doesn't know the + * virtual mappings of every page that may be in device memory. For example this + * is often the case when a driver is being unloaded or unbound from a device. + * + * Like migrate_vma_setup() this function will take a reference and lock any + * migrating pages that aren't free before unmapping them. Drivers may then + * allocate destination pages and start copying data from the device to CPU + * memory before calling migrate_device_pages(). + */ +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages) +{ + unsigned long i, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + + if (!get_page_unless_zero(page)) { + src_pfns[i] = 0; + continue; + } + + if (!trylock_page(page)) { + src_pfns[i] = 0; + put_page(page); + continue; + } + + src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + } + + migrate_device_unmap(src_pfns, npages, NULL); + + return 0; +} +EXPORT_SYMBOL(migrate_device_range); + /* * Migrate a device coherent page back to normal memory. The caller should have * a reference on page which will be copied to the new page if migration is @@ -885,7 +960,7 @@ int migrate_device_coherent_page(struct page *page) dst_pfn = migrate_pfn(page_to_pfn(dpage)); } - migrate_device_pages(&src_pfn, &dst_pfn, 1, NULL); + migrate_device_pages(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) copy_highpage(dpage, page); migrate_device_finalize(&src_pfn, &dst_pfn, 1); -- cgit v1.2.3