From 0092908d16c604b8207c2141ec64b0fa4473bb03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:06 +0200 Subject: mm: factor out a devm_request_free_mem_region helper Keep the physical address allocation that hmm_add_device does with the rest of the resource code, and allow future reuse of it without the hmm wrapper. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: John Hubbard Reviewed-by: Dan Williams Signed-off-by: Jason Gunthorpe --- kernel/resource.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 158f04ec1d4f..d22423e85cf8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); +#ifdef CONFIG_DEVICE_PRIVATE +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE + * memory, which in turn allocates struct pages. + */ +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size) +{ + resource_size_t end, addr; + struct resource *res; + + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); + addr = end - size + 1UL; + + for (; addr > size && addr >= base->start; addr -= size) { + if (region_intersects(addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) + continue; + + res = devm_request_mem_region(dev, addr, size, dev_name(dev)); + if (!res) + return ERR_PTR(-ENOMEM); + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + return res; + } + + return ERR_PTR(-ERANGE); +} +EXPORT_SYMBOL_GPL(devm_request_free_mem_region); +#endif /* CONFIG_DEVICE_PRIVATE */ + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) -- cgit v1.2.3 From 3ed2dcdf54d5bf1f9823b5faf1a702e7cee53982 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:07 +0200 Subject: memremap: validate the pagemap type passed to devm_memremap_pages Most pgmap types are only supported when certain config options are enabled. Check for a type that is valid for the current configuration before setting up the pagemap. For this the usage of the 0 type for device dax gets replaced with an explicit MEMORY_DEVICE_DEVDAX type. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/dax/device.c | 1 + include/linux/memremap.h | 8 ++++++++ kernel/memremap.c | 22 ++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 8465d12fecba..79014baa782d 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -468,6 +468,7 @@ int dev_dax_probe(struct device *dev) dev_dax->pgmap.ref = &dev_dax->ref; dev_dax->pgmap.kill = dev_dax_percpu_kill; dev_dax->pgmap.cleanup = dev_dax_percpu_exit; + dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX; addr = devm_memremap_pages(dev, &dev_dax->pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 995c62c5a48b..0c86f2c5ac9c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -45,13 +45,21 @@ struct vmem_altmap { * wakeup is used to coordinate physical address space management (ex: * fs truncate/hole punch) vs pinned pages (ex: device dma). * + * MEMORY_DEVICE_DEVDAX: + * Host memory that has similar access semantics as System RAM i.e. DMA + * coherent and supports page pinning. In contrast to + * MEMORY_DEVICE_FS_DAX, this memory is access via a device-dax + * character device. + * * MEMORY_DEVICE_PCI_P2PDMA: * Device memory residing in a PCI BAR intended for use with Peer-to-Peer * transactions. */ enum memory_type { + /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, MEMORY_DEVICE_FS_DAX, + MEMORY_DEVICE_DEVDAX, MEMORY_DEVICE_PCI_P2PDMA, }; diff --git a/kernel/memremap.c b/kernel/memremap.c index 6e1970719dc2..abda62d1e5a3 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -157,6 +157,28 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { + WARN(1, "Device private memory not supported\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_FS_DAX: + if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || + IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { + WARN(1, "File system DAX not supported\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_DEVDAX: + case MEMORY_DEVICE_PCI_P2PDMA: + break; + default: + WARN(1, "Invalid pgmap type %d\n", pgmap->type); + break; + } + if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); -- cgit v1.2.3 From 1e240e8d4a7d92232b6214e02a0a4197a53afd6c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:08 +0200 Subject: memremap: move dev_pagemap callbacks into a separate structure The dev_pagemap is a growing too many callbacks. Move them into a separate ops structure so that they are not duplicated for multiple instances, and an attacker can't easily overwrite them. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/dax/device.c | 11 +++++++---- drivers/dax/pmem/core.c | 2 +- drivers/nvdimm/pmem.c | 19 +++++++++++-------- drivers/pci/p2pdma.c | 8 ++++++-- include/linux/memremap.h | 36 ++++++++++++++++++++---------------- kernel/memremap.c | 18 +++++++++--------- mm/hmm.c | 10 +++++++--- tools/testing/nvdimm/test/iomap.c | 7 ++++--- 8 files changed, 65 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 79014baa782d..f390083a64d7 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -36,9 +36,8 @@ static void dev_dax_percpu_exit(struct percpu_ref *ref) percpu_ref_exit(ref); } -static void dev_dax_percpu_kill(struct percpu_ref *data) +static void dev_dax_percpu_kill(struct percpu_ref *ref) { - struct percpu_ref *ref = data; struct dev_dax *dev_dax = ref_to_dev_dax(ref); dev_dbg(&dev_dax->dev, "%s\n", __func__); @@ -442,6 +441,11 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } +static const struct dev_pagemap_ops dev_dax_pagemap_ops = { + .kill = dev_dax_percpu_kill, + .cleanup = dev_dax_percpu_exit, +}; + int dev_dax_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -466,9 +470,8 @@ int dev_dax_probe(struct device *dev) return rc; dev_dax->pgmap.ref = &dev_dax->ref; - dev_dax->pgmap.kill = dev_dax_percpu_kill; - dev_dax->pgmap.cleanup = dev_dax_percpu_exit; dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX; + dev_dax->pgmap.ops = &dev_dax_pagemap_ops; addr = devm_memremap_pages(dev, &dev_dax->pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c index f9f51786d556..6eb6dfdf19bf 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem/core.c @@ -16,7 +16,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) struct dev_dax *dev_dax; struct nd_namespace_io *nsio; struct dax_region *dax_region; - struct dev_pagemap pgmap = { 0 }; + struct dev_pagemap pgmap = { }; struct nd_namespace_common *ndns; struct nd_dax *nd_dax = to_nd_dax(dev); struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 24d7fe7c74ed..c2449af2b388 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -303,7 +303,7 @@ static const struct attribute_group *pmem_attribute_groups[] = { NULL, }; -static void __pmem_release_queue(struct percpu_ref *ref) +static void pmem_pagemap_cleanup(struct percpu_ref *ref) { struct request_queue *q; @@ -313,10 +313,10 @@ static void __pmem_release_queue(struct percpu_ref *ref) static void pmem_release_queue(void *ref) { - __pmem_release_queue(ref); + pmem_pagemap_cleanup(ref); } -static void pmem_freeze_queue(struct percpu_ref *ref) +static void pmem_pagemap_kill(struct percpu_ref *ref) { struct request_queue *q; @@ -339,19 +339,24 @@ static void pmem_release_pgmap_ops(void *__pgmap) dev_pagemap_put_ops(); } -static void fsdax_pagefree(struct page *page, void *data) +static void pmem_pagemap_page_free(struct page *page, void *data) { wake_up_var(&page->_refcount); } +static const struct dev_pagemap_ops fsdax_pagemap_ops = { + .page_free = pmem_pagemap_page_free, + .kill = pmem_pagemap_kill, + .cleanup = pmem_pagemap_cleanup, +}; + static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap) { dev_pagemap_get_ops(); if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap)) return -ENOMEM; pgmap->type = MEMORY_DEVICE_FS_DAX; - pgmap->page_free = fsdax_pagefree; - + pgmap->ops = &fsdax_pagemap_ops; return 0; } @@ -409,8 +414,6 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; - pmem->pgmap.kill = pmem_freeze_queue; - pmem->pgmap.cleanup = __pmem_release_queue; if (is_nd_pfn(dev)) { if (setup_pagemap_fsdax(dev, &pmem->pgmap)) return -ENOMEM; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index a4994aa3acc0..fb039259d463 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -153,6 +153,11 @@ out: return error; } +static const struct dev_pagemap_ops pci_p2pdma_pagemap_ops = { + .kill = pci_p2pdma_percpu_kill, + .cleanup = pci_p2pdma_percpu_cleanup, +}; + /** * pci_p2pdma_add_resource - add memory for use as p2p memory * @pdev: the device to add the memory to @@ -208,8 +213,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); - pgmap->kill = pci_p2pdma_percpu_kill; - pgmap->cleanup = pci_p2pdma_percpu_cleanup; + pgmap->ops = &pci_p2pdma_pagemap_ops; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 0c86f2c5ac9c..919755f48c7e 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -63,41 +63,45 @@ enum memory_type { MEMORY_DEVICE_PCI_P2PDMA, }; -/* - * Additional notes about MEMORY_DEVICE_PRIVATE may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief - * explanation in include/linux/memory_hotplug.h. - * - * The page_free() callback is called once the page refcount reaches 1 - * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. - * This allows the device driver to implement its own memory management.) - */ -typedef void (*dev_page_free_t)(struct page *page, void *data); +struct dev_pagemap_ops { + /* + * Called once the page refcount reaches 1. (ZONE_DEVICE pages never + * reach 0 refcount unless there is a refcount bug. This allows the + * device driver to implement its own memory management.) + */ + void (*page_free)(struct page *page, void *data); + + /* + * Transition the refcount in struct dev_pagemap to the dead state. + */ + void (*kill)(struct percpu_ref *ref); + + /* + * Wait for refcount in struct dev_pagemap to be idle and reap it. + */ + void (*cleanup)(struct percpu_ref *ref); +}; /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings - * @page_free: free page callback when page refcount reaches 1 * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping - * @kill: callback to transition @ref to the dead state - * @cleanup: callback to wait for @ref to be idle and reap it * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h + * @ops: method table */ struct dev_pagemap { - dev_page_free_t page_free; struct vmem_altmap altmap; bool altmap_valid; struct resource res; struct percpu_ref *ref; - void (*kill)(struct percpu_ref *ref); - void (*cleanup)(struct percpu_ref *ref); struct device *dev; void *data; enum memory_type type; u64 pci_p2pdma_bus_offset; + const struct dev_pagemap_ops *ops; }; #ifdef CONFIG_ZONE_DEVICE diff --git a/kernel/memremap.c b/kernel/memremap.c index abda62d1e5a3..0824237ef979 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->kill(pgmap->ref); + pgmap->ops->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->cleanup(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -128,8 +128,8 @@ static void devm_memremap_pages_release(void *data) * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type members of @pgmap must be initialized - * by the caller before passing it to this function + * 1/ At a minimum the res, ref and type and ops members of @pgmap must be + * initialized by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true @@ -179,7 +179,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; } - if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { + if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill || + !pgmap->ops->cleanup) { WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); } @@ -293,9 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); - + pgmap->ops->kill(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); @@ -388,7 +388,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page, page->pgmap->data); } else if (!count) __put_page(page); } diff --git a/mm/hmm.c b/mm/hmm.c index 48574f8485bb..583a02a16872 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1384,6 +1384,12 @@ static void hmm_devmem_free(struct page *page, void *data) devmem->ops->free(devmem, page); } +static const struct dev_pagemap_ops hmm_pagemap_ops = { + .page_free = hmm_devmem_free, + .kill = hmm_devmem_ref_kill, + .cleanup = hmm_devmem_ref_exit, +}; + /* * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory * @@ -1438,12 +1444,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.ops = &hmm_pagemap_ops; devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; result = devm_memremap_pages(devmem->device, &devmem->pagemap); if (IS_ERR(result)) diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 076df22e4bda..cf3f064a697d 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -100,9 +100,10 @@ static void nfit_test_kill(void *_pgmap) { struct dev_pagemap *pgmap = _pgmap; - WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup); - pgmap->kill(pgmap->ref); - pgmap->cleanup(pgmap->ref); + WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || + !pgmap->ops->cleanup); + pgmap->ops->kill(pgmap->ref); + pgmap->ops->cleanup(pgmap->ref); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- cgit v1.2.3 From d8668bb0451c3c45b59dbcde2654e0539aad1d2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:09 +0200 Subject: memremap: pass a struct dev_pagemap to ->kill and ->cleanup Passing the actual typed structure leads to more understandable code vs just passing the ref member. Reported-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/dax/device.c | 12 ++++++------ drivers/nvdimm/pmem.c | 18 +++++++++--------- drivers/pci/p2pdma.c | 9 +++++---- include/linux/memremap.h | 4 ++-- kernel/memremap.c | 8 ++++---- mm/hmm.c | 10 +++++----- tools/testing/nvdimm/test/iomap.c | 4 ++-- 7 files changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index f390083a64d7..b5257038c188 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -27,21 +27,21 @@ static void dev_dax_percpu_release(struct percpu_ref *ref) complete(&dev_dax->cmp); } -static void dev_dax_percpu_exit(struct percpu_ref *ref) +static void dev_dax_percpu_exit(struct dev_pagemap *pgmap) { - struct dev_dax *dev_dax = ref_to_dev_dax(ref); + struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); dev_dbg(&dev_dax->dev, "%s\n", __func__); wait_for_completion(&dev_dax->cmp); - percpu_ref_exit(ref); + percpu_ref_exit(pgmap->ref); } -static void dev_dax_percpu_kill(struct percpu_ref *ref) +static void dev_dax_percpu_kill(struct dev_pagemap *pgmap) { - struct dev_dax *dev_dax = ref_to_dev_dax(ref); + struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); dev_dbg(&dev_dax->dev, "%s\n", __func__); - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c2449af2b388..9dac48359353 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -303,24 +303,24 @@ static const struct attribute_group *pmem_attribute_groups[] = { NULL, }; -static void pmem_pagemap_cleanup(struct percpu_ref *ref) +static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap) { - struct request_queue *q; + struct request_queue *q = + container_of(pgmap->ref, struct request_queue, q_usage_counter); - q = container_of(ref, typeof(*q), q_usage_counter); blk_cleanup_queue(q); } -static void pmem_release_queue(void *ref) +static void pmem_release_queue(void *pgmap) { - pmem_pagemap_cleanup(ref); + pmem_pagemap_cleanup(pgmap); } -static void pmem_pagemap_kill(struct percpu_ref *ref) +static void pmem_pagemap_kill(struct dev_pagemap *pgmap) { - struct request_queue *q; + struct request_queue *q = + container_of(pgmap->ref, struct request_queue, q_usage_counter); - q = container_of(ref, typeof(*q), q_usage_counter); blk_freeze_queue_start(q); } @@ -435,7 +435,7 @@ static int pmem_attach_disk(struct device *dev, memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); } else { if (devm_add_action_or_reset(dev, pmem_release_queue, - &q->q_usage_counter)) + &pmem->pgmap)) return -ENOMEM; addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index fb039259d463..fa6249e4ed5f 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -91,14 +91,15 @@ static void pci_p2pdma_percpu_release(struct percpu_ref *ref) complete(&p2p_pgmap->ref_done); } -static void pci_p2pdma_percpu_kill(struct percpu_ref *ref) +static void pci_p2pdma_percpu_kill(struct dev_pagemap *pgmap) { - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } -static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref) +static void pci_p2pdma_percpu_cleanup(struct dev_pagemap *pgmap) { - struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref); + struct p2pdma_pagemap *p2p_pgmap = + container_of(pgmap, struct p2pdma_pagemap, pgmap); wait_for_completion(&p2p_pgmap->ref_done); percpu_ref_exit(&p2p_pgmap->ref); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 919755f48c7e..b8666a0d8665 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -74,12 +74,12 @@ struct dev_pagemap_ops { /* * Transition the refcount in struct dev_pagemap to the dead state. */ - void (*kill)(struct percpu_ref *ref); + void (*kill)(struct dev_pagemap *pgmap); /* * Wait for refcount in struct dev_pagemap to be idle and reap it. */ - void (*cleanup)(struct percpu_ref *ref); + void (*cleanup)(struct dev_pagemap *pgmap); }; /** diff --git a/kernel/memremap.c b/kernel/memremap.c index 0824237ef979..00c1ceb60c19 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,10 +92,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap->ref); + pgmap->ops->kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -294,8 +294,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap->ref); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->kill(pgmap); + pgmap->ops->cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/mm/hmm.c b/mm/hmm.c index 583a02a16872..987793fba923 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1352,18 +1352,18 @@ static void hmm_devmem_ref_release(struct percpu_ref *ref) complete(&devmem->completion); } -static void hmm_devmem_ref_exit(struct percpu_ref *ref) +static void hmm_devmem_ref_exit(struct dev_pagemap *pgmap) { struct hmm_devmem *devmem; - devmem = container_of(ref, struct hmm_devmem, ref); + devmem = container_of(pgmap, struct hmm_devmem, pagemap); wait_for_completion(&devmem->completion); - percpu_ref_exit(ref); + percpu_ref_exit(pgmap->ref); } -static void hmm_devmem_ref_kill(struct percpu_ref *ref) +static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) { - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index cf3f064a697d..82f901569e06 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -102,8 +102,8 @@ static void nfit_test_kill(void *_pgmap) WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup); - pgmap->ops->kill(pgmap->ref); - pgmap->ops->cleanup(pgmap->ref); + pgmap->ops->kill(pgmap); + pgmap->ops->cleanup(pgmap); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- cgit v1.2.3 From f6a55e1a3fe6b3bb294a80a05437fcf86488d819 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:10 +0200 Subject: memremap: lift the devmap_enable manipulation into devm_memremap_pages Just check if there is a ->page_free operation set and take care of the static key enable, as well as the put using device managed resources. Also check that a ->page_free is provided for the pgmaps types that require it, and check for a valid type as well while we are at it. Note that this also fixes the fact that hmm never called dev_pagemap_put_ops and thus would leave the slow path enabled forever, even after a device driver unload or disable. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/nvdimm/pmem.c | 23 ++++---------------- include/linux/mm.h | 10 --------- kernel/memremap.c | 59 ++++++++++++++++++++++++++++++++------------------- mm/hmm.c | 2 -- 4 files changed, 41 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 9dac48359353..48767171a4df 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -334,11 +334,6 @@ static void pmem_release_disk(void *__pmem) put_disk(pmem->disk); } -static void pmem_release_pgmap_ops(void *__pgmap) -{ - dev_pagemap_put_ops(); -} - static void pmem_pagemap_page_free(struct page *page, void *data) { wake_up_var(&page->_refcount); @@ -350,16 +345,6 @@ static const struct dev_pagemap_ops fsdax_pagemap_ops = { .cleanup = pmem_pagemap_cleanup, }; -static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap) -{ - dev_pagemap_get_ops(); - if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap)) - return -ENOMEM; - pgmap->type = MEMORY_DEVICE_FS_DAX; - pgmap->ops = &fsdax_pagemap_ops; - return 0; -} - static int pmem_attach_disk(struct device *dev, struct nd_namespace_common *ndns) { @@ -415,8 +400,8 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; if (is_nd_pfn(dev)) { - if (setup_pagemap_fsdax(dev, &pmem->pgmap)) - return -ENOMEM; + pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); @@ -428,8 +413,8 @@ static int pmem_attach_disk(struct device *dev, } else if (pmem_should_map_pages(dev)) { memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); pmem->pgmap.altmap_valid = false; - if (setup_pagemap_fsdax(dev, &pmem->pgmap)) - return -ENOMEM; + pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7399f9f08de6..2425f4167ec2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -932,8 +932,6 @@ static inline bool is_zone_device_page(const struct page *page) #endif #ifdef CONFIG_DEV_PAGEMAP_OPS -void dev_pagemap_get_ops(void); -void dev_pagemap_put_ops(void); void __put_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); static inline bool put_devmap_managed_page(struct page *page) @@ -973,14 +971,6 @@ static inline bool is_pci_p2pdma_page(const struct page *page) #endif /* CONFIG_PCI_P2PDMA */ #else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline void dev_pagemap_get_ops(void) -{ -} - -static inline void dev_pagemap_put_ops(void) -{ -} - static inline bool put_devmap_managed_page(struct page *page) { return false; diff --git a/kernel/memremap.c b/kernel/memremap.c index 00c1ceb60c19..3219a4c91d07 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -17,6 +17,35 @@ static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); +static atomic_t devmap_managed_enable; + +static void devmap_managed_enable_put(void *data) +{ + if (atomic_dec_and_test(&devmap_managed_enable)) + static_branch_disable(&devmap_managed_key); +} + +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + if (!pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return -EINVAL; + } + + if (atomic_inc_return(&devmap_managed_enable) == 1) + static_branch_enable(&devmap_managed_key); + return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); +} +#else +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + return -EINVAL; +} +#endif /* CONFIG_DEV_PAGEMAP_OPS */ + #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, @@ -156,6 +185,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + bool need_devmap_managed = true; switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: @@ -173,6 +203,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; case MEMORY_DEVICE_DEVDAX: case MEMORY_DEVICE_PCI_P2PDMA: + need_devmap_managed = false; break; default: WARN(1, "Invalid pgmap type %d\n", pgmap->type); @@ -185,6 +216,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return ERR_PTR(-EINVAL); } + if (need_devmap_managed) { + error = devmap_managed_enable_get(dev, pgmap); + if (error) + return ERR_PTR(error); + } + align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; @@ -351,28 +388,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_enable; - -/* - * Toggle the static key for ->page_free() callbacks when dev_pagemap - * pages go idle. - */ -void dev_pagemap_get_ops(void) -{ - if (atomic_inc_return(&devmap_enable) == 1) - static_branch_enable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); - -void dev_pagemap_put_ops(void) -{ - if (atomic_dec_and_test(&devmap_enable)) - static_branch_disable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); - void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); diff --git a/mm/hmm.c b/mm/hmm.c index 987793fba923..5b0bd5f6a74f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1415,8 +1415,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, void *result; int ret; - dev_pagemap_get_ops(); - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); if (!devmem) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 897e6365cda6ba6356e83a3aaa68dec82ef4c548 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:11 +0200 Subject: memremap: add a migrate_to_ram method to struct dev_pagemap_ops This replaces the hacky ->fault callback, which is currently directly called from common code through a hmm specific data structure as an exercise in layering violations. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- include/linux/hmm.h | 6 ------ include/linux/memremap.h | 6 ++++++ include/linux/swapops.h | 15 --------------- kernel/memremap.c | 35 ++++------------------------------- mm/hmm.c | 13 +++++-------- mm/memory.c | 9 ++------- 6 files changed, 17 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 44a5ac738bb5..ba19c19e24ed 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -692,11 +692,6 @@ struct hmm_devmem_ops { * chunk, as an optimization. It must, however, prioritize the faulting address * over all the others. */ -typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp); struct hmm_devmem { struct completion completion; @@ -707,7 +702,6 @@ struct hmm_devmem { struct dev_pagemap pagemap; const struct hmm_devmem_ops *ops; struct percpu_ref ref; - dev_page_fault_t page_fault; }; /* diff --git a/include/linux/memremap.h b/include/linux/memremap.h index b8666a0d8665..ac985bd03a7f 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -80,6 +80,12 @@ struct dev_pagemap_ops { * Wait for refcount in struct dev_pagemap to be idle and reap it. */ void (*cleanup)(struct dev_pagemap *pgmap); + + /* + * Used for private (un-addressable) device memory only. Must migrate + * the page back to a CPU accessible page. + */ + vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); }; /** diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 4d961668e5fc..15bdb6fe71e5 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) { return pfn_to_page(swp_offset(entry)); } - -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp); #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_device_private_entry(struct page *page, bool write) { @@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) { return NULL; } - -static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) -{ - return VM_FAULT_SIGBUS; -} #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION diff --git a/kernel/memremap.c b/kernel/memremap.c index 3219a4c91d07..c06a5487dda7 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,7 +11,6 @@ #include #include #include -#include static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -46,36 +45,6 @@ static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgm } #endif /* CONFIG_DEV_PAGEMAP_OPS */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) -{ - struct page *page = device_private_entry_to_page(entry); - struct hmm_devmem *devmem; - - devmem = container_of(page->pgmap, typeof(*devmem), pagemap); - - /* - * The page_fault() callback must migrate page back to system memory - * so that CPU can access it. This might fail for various reasons - * (device issue, device was unsafely unplugged, ...). When such - * error conditions happen, the callback must return VM_FAULT_SIGBUS. - * - * Note that because memory cgroup charges are accounted to the device - * memory, this should never fail because of memory restrictions (but - * allocation of regular system page might still fail because we are - * out of memory). - * - * There is a more in-depth description of what that callback can and - * cannot do, in include/linux/memremap.h - */ - return devmem->page_fault(vma, addr, page, flags, pmdp); -} -#endif /* CONFIG_DEVICE_PRIVATE */ - static void pgmap_array_delete(struct resource *res) { xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), @@ -193,6 +162,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) WARN(1, "Device private memory not supported\n"); return ERR_PTR(-EINVAL); } + if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { + WARN(1, "Missing migrate_to_ram method\n"); + return ERR_PTR(-EINVAL); + } break; case MEMORY_DEVICE_FS_DAX: if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || diff --git a/mm/hmm.c b/mm/hmm.c index 5b0bd5f6a74f..96633ee066d8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1366,15 +1366,12 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) percpu_ref_kill(pgmap->ref); } -static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp) +static vm_fault_t hmm_devmem_migrate_to_ram(struct vm_fault *vmf) { - struct hmm_devmem *devmem = page->pgmap->data; + struct hmm_devmem *devmem = vmf->page->pgmap->data; - return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); + return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page, + vmf->flags, vmf->pmd); } static void hmm_devmem_free(struct page *page, void *data) @@ -1388,6 +1385,7 @@ static const struct dev_pagemap_ops hmm_pagemap_ops = { .page_free = hmm_devmem_free, .kill = hmm_devmem_ref_kill, .cleanup = hmm_devmem_ref_exit, + .migrate_to_ram = hmm_devmem_migrate_to_ram, }; /* @@ -1438,7 +1436,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; diff --git a/mm/memory.c b/mm/memory.c index 2d14f4c7e152..d437ccdb210c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2748,13 +2748,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_private_entry(entry)) { - /* - * For un-addressable device memory we call the pgmap - * fault handler callback. The callback must migrate - * the page back to some CPU accessible page. - */ - ret = device_private_entry_fault(vma, vmf->address, entry, - vmf->flags, vmf->pmd); + vmf->page = device_private_entry_to_page(entry); + ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { -- cgit v1.2.3 From 80a72d0af05ae97a8b106c172e431072ba587492 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:12 +0200 Subject: memremap: remove the data field in struct dev_pagemap struct dev_pagemap is always embedded into a containing structure, so there is no need to an additional private data field. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- drivers/nvdimm/pmem.c | 2 +- include/linux/memremap.h | 3 +-- kernel/memremap.c | 2 +- mm/hmm.c | 9 +++++---- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 48767171a4df..093408ce40ad 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -334,7 +334,7 @@ static void pmem_release_disk(void *__pmem) put_disk(pmem->disk); } -static void pmem_pagemap_page_free(struct page *page, void *data) +static void pmem_pagemap_page_free(struct page *page) { wake_up_var(&page->_refcount); } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index ac985bd03a7f..336eca601dad 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -69,7 +69,7 @@ struct dev_pagemap_ops { * reach 0 refcount unless there is a refcount bug. This allows the * device driver to implement its own memory management.) */ - void (*page_free)(struct page *page, void *data); + void (*page_free)(struct page *page); /* * Transition the refcount in struct dev_pagemap to the dead state. @@ -104,7 +104,6 @@ struct dev_pagemap { struct resource res; struct percpu_ref *ref; struct device *dev; - void *data; enum memory_type type; u64 pci_p2pdma_bus_offset; const struct dev_pagemap_ops *ops; diff --git a/kernel/memremap.c b/kernel/memremap.c index c06a5487dda7..6c3dbb692037 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -376,7 +376,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->ops->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page); } else if (!count) __put_page(page); } diff --git a/mm/hmm.c b/mm/hmm.c index 96633ee066d8..36e25cdbdac1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1368,15 +1368,17 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) static vm_fault_t hmm_devmem_migrate_to_ram(struct vm_fault *vmf) { - struct hmm_devmem *devmem = vmf->page->pgmap->data; + struct hmm_devmem *devmem = + container_of(vmf->page->pgmap, struct hmm_devmem, pagemap); return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page, vmf->flags, vmf->pmd); } -static void hmm_devmem_free(struct page *page, void *data) +static void hmm_devmem_free(struct page *page) { - struct hmm_devmem *devmem = data; + struct hmm_devmem *devmem = + container_of(page->pgmap, struct hmm_devmem, pagemap); devmem->ops->free(devmem, page); } @@ -1442,7 +1444,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pagemap.ops = &hmm_pagemap_ops; devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; result = devm_memremap_pages(devmem->device, &devmem->pagemap); if (IS_ERR(result)) -- cgit v1.2.3 From 514caf23a70fd697fa2ece238b2cd8dcc73fb16f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:13 +0200 Subject: memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID flag Add a flags field to struct dev_pagemap to replace the altmap_valid boolean to be a little more extensible. Also add a pgmap_altmap() helper to find the optional altmap and clean up the code using the altmap using it. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- arch/powerpc/mm/mem.c | 10 +--------- arch/x86/mm/init_64.c | 8 ++------ drivers/nvdimm/pfn_devs.c | 3 +-- drivers/nvdimm/pmem.c | 1 - include/linux/memremap.h | 12 +++++++++++- kernel/memremap.c | 26 ++++++++++---------------- mm/hmm.c | 1 - mm/memory_hotplug.c | 6 ++---- mm/page_alloc.c | 5 ++--- 9 files changed, 29 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 2540d3b2588c..a2923c5c1982 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -131,17 +131,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page; + struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); int ret; - /* - * If we have an altmap then we need to skip over any reserved PFNs - * when querying the zone. - */ - page = pfn_to_page(start_pfn); - if (altmap) - page += vmem_altmap_offset(altmap); - __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); /* Remove htab bolted mappings for this section of memory */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0f01c7b1d217..08bbf648827b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1213,13 +1213,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct page *page = pfn_to_page(start_pfn); - struct zone *zone; + struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap); + struct zone *zone = page_zone(page); - /* With altmap the first mapped page is offset from @start */ - if (altmap) - page += vmem_altmap_offset(altmap); - zone = page_zone(page); __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); } diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 0f81fc56bbfd..55fb6b7433ed 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -622,7 +622,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) if (offset < reserve) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); - pgmap->altmap_valid = false; } else if (nd_pfn->mode == PFN_MODE_PMEM) { nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res) - offset) / PAGE_SIZE); @@ -634,7 +633,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) memcpy(altmap, &__altmap, sizeof(*altmap)); altmap->free = PHYS_PFN(offset - reserve); altmap->alloc = 0; - pgmap->altmap_valid = true; + pgmap->flags |= PGMAP_ALTMAP_VALID; } else return -ENXIO; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 093408ce40ad..e7d8cc9f41e8 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -412,7 +412,6 @@ static int pmem_attach_disk(struct device *dev, bb_res.start += pmem->data_offset; } else if (pmem_should_map_pages(dev)) { memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); - pmem->pgmap.altmap_valid = false; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 336eca601dad..e25685b878e9 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -88,6 +88,8 @@ struct dev_pagemap_ops { vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); }; +#define PGMAP_ALTMAP_VALID (1 << 0) + /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations @@ -96,19 +98,27 @@ struct dev_pagemap_ops { * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h + * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table */ struct dev_pagemap { struct vmem_altmap altmap; - bool altmap_valid; struct resource res; struct percpu_ref *ref; struct device *dev; enum memory_type type; + unsigned int flags; u64 pci_p2pdma_bus_offset; const struct dev_pagemap_ops *ops; }; +static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) +{ + if (pgmap->flags & PGMAP_ALTMAP_VALID) + return &pgmap->altmap; + return NULL; +} + #ifdef CONFIG_ZONE_DEVICE void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); diff --git a/kernel/memremap.c b/kernel/memremap.c index 6c3dbb692037..eee490e7d7e1 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -54,14 +54,8 @@ static void pgmap_array_delete(struct resource *res) static unsigned long pfn_first(struct dev_pagemap *pgmap) { - const struct resource *res = &pgmap->res; - struct vmem_altmap *altmap = &pgmap->altmap; - unsigned long pfn; - - pfn = res->start >> PAGE_SHIFT; - if (pgmap->altmap_valid) - pfn += vmem_altmap_offset(altmap); - return pfn; + return (pgmap->res.start >> PAGE_SHIFT) + + vmem_altmap_offset(pgmap_altmap(pgmap)); } static unsigned long pfn_end(struct dev_pagemap *pgmap) @@ -109,7 +103,7 @@ static void devm_memremap_pages_release(void *data) align_size >> PAGE_SHIFT, NULL); } else { arch_remove_memory(nid, align_start, align_size, - pgmap->altmap_valid ? &pgmap->altmap : NULL); + pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(align_start), align_size); } mem_hotplug_done(); @@ -129,8 +123,8 @@ static void devm_memremap_pages_release(void *data) * 1/ At a minimum the res, ref and type and ops members of @pgmap must be * initialized by the caller before passing it to this function * - * 2/ The altmap field may optionally be initialized, in which case altmap_valid - * must be set to true + * 2/ The altmap field may optionally be initialized, in which case + * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped * at devm_memremap_pages_release() time, or if this routine fails. @@ -142,15 +136,13 @@ static void devm_memremap_pages_release(void *data) void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; - struct vmem_altmap *altmap = pgmap->altmap_valid ? - &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; struct mhp_restrictions restrictions = { /* * We do not want any optional features only our own memmap */ - .altmap = altmap, + .altmap = pgmap_altmap(pgmap), }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -274,7 +266,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, altmap); + align_size >> PAGE_SHIFT, pgmap_altmap(pgmap)); } mem_hotplug_done(); @@ -319,7 +311,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ - return altmap->reserve + altmap->free; + if (altmap) + return altmap->reserve + altmap->free; + return 0; } void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) diff --git a/mm/hmm.c b/mm/hmm.c index 36e25cdbdac1..e4470462298f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1442,7 +1442,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; devmem->pagemap.ops = &hmm_pagemap_ops; - devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; result = devm_memremap_pages(devmem->device, &devmem->pagemap); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e096c987d261..6166ba5a15f3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, int sections_to_remove; /* In the ZONE_DEVICE case device driver owns the memory region */ - if (is_dev_zone(zone)) { - if (altmap) - map_offset = vmem_altmap_offset(altmap); - } + if (is_dev_zone(zone)) + map_offset = vmem_altmap_offset(altmap); clear_zone_contiguous(zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d66bc8abe0af..17a39d40a556 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5853,6 +5853,7 @@ void __ref memmap_init_zone_device(struct zone *zone, { unsigned long pfn, end_pfn = start_pfn + size; struct pglist_data *pgdat = zone->zone_pgdat; + struct vmem_altmap *altmap = pgmap_altmap(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; @@ -5865,9 +5866,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * of the pages reserved for the memmap, so we can just jump to * the end of that region and start processing the device pages. */ - if (pgmap->altmap_valid) { - struct vmem_altmap *altmap = &pgmap->altmap; - + if (altmap) { start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); size = end_pfn - start_pfn; } -- cgit v1.2.3 From 24917f6b1041a73993178920656e13364f847995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Jun 2019 14:27:14 +0200 Subject: memremap: provide an optional internal refcount in struct dev_pagemap Provide an internal refcounting logic if no ->ref field is provided in the pagemap passed into devm_memremap_pages so that callers don't have to reinvent it poorly. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Jason Gunthorpe --- include/linux/memremap.h | 4 +++ kernel/memremap.c | 64 +++++++++++++++++++++++++++++++-------- tools/testing/nvdimm/test/iomap.c | 58 +++++++++++++++++++++++++++-------- 3 files changed, 101 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e25685b878e9..f8a5b2a19945 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -95,6 +95,8 @@ struct dev_pagemap_ops { * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping + * @internal_ref: internal reference if @ref is not provided by the caller + * @done: completion for @internal_ref * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h @@ -105,6 +107,8 @@ struct dev_pagemap { struct vmem_altmap altmap; struct resource res; struct percpu_ref *ref; + struct percpu_ref internal_ref; + struct completion done; struct device *dev; enum memory_type type; unsigned int flags; diff --git a/kernel/memremap.c b/kernel/memremap.c index eee490e7d7e1..bea6f887adad 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,7 +29,7 @@ static void devmap_managed_enable_put(void *data) static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) { - if (!pgmap->ops->page_free) { + if (!pgmap->ops || !pgmap->ops->page_free) { WARN(1, "Missing page_free method\n"); return -EINVAL; } @@ -75,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +static void dev_pagemap_kill(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); +} + +static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; @@ -84,10 +102,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap); + dev_pagemap_kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); - pgmap->ops->cleanup(pgmap); + dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -114,20 +132,29 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); +} + /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type and ops members of @pgmap must be - * initialized by the caller before passing it to this function + * 1/ At a minimum the res and type members of @pgmap must be initialized + * by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case * PGMAP_ALTMAP_VALID must be set in pgmap->flags. * - * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped - * at devm_memremap_pages_release() time, or if this routine fails. + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed and reaped at + * devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -175,10 +202,21 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) break; } - if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill || - !pgmap->ops->cleanup) { - WARN(1, "Missing reference count teardown definition\n"); - return ERR_PTR(-EINVAL); + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } } if (need_devmap_managed) { @@ -296,8 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap); - pgmap->ops->cleanup(pgmap); + dev_pagemap_kill(pgmap); + dev_pagemap_cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 82f901569e06..cd040b5abffe 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -100,26 +100,60 @@ static void nfit_test_kill(void *_pgmap) { struct dev_pagemap *pgmap = _pgmap; - WARN_ON(!pgmap || !pgmap->ref || !pgmap->ops || !pgmap->ops->kill || - !pgmap->ops->cleanup); - pgmap->ops->kill(pgmap); - pgmap->ops->cleanup(pgmap); + WARN_ON(!pgmap || !pgmap->ref); + + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); + + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { + int error; resource_size_t offset = pgmap->res.start; struct nfit_test_resource *nfit_res = get_nfit_res(offset); - if (nfit_res) { - int rc; - - rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); - if (rc) - return ERR_PTR(rc); - return nfit_res->buf + offset - nfit_res->res.start; + if (!nfit_res) + return devm_memremap_pages(dev, pgmap); + + pgmap->dev = dev; + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } } - return devm_memremap_pages(dev, pgmap); + + error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); + if (error) + return ERR_PTR(error); + return nfit_res->buf + offset - nfit_res->res.start; } EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages); -- cgit v1.2.3