diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-05 18:49:20 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-05 18:49:20 -0700 |
commit | 53ef7d0e208fa38c3f63d287e0c3ab174f1e1235 (patch) | |
tree | 7d437edf73ef6deb0d77ce291aa25f041837d056 /drivers | |
parent | c6a677c6f37bb7abc85ba7e3465e82b9f7eb1d91 (diff) | |
parent | 736163671bcb163fc82600b46c83dfa89d532d95 (diff) | |
download | lwn-53ef7d0e208fa38c3f63d287e0c3ab174f1e1235.tar.gz lwn-53ef7d0e208fa38c3f63d287e0c3ab174f1e1235.zip |
Merge tag 'libnvdimm-for-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm updates from Dan Williams:
"The bulk of this has been in multiple -next releases. There were a few
late breaking fixes and small features that got added in the last
couple days, but the whole set has received a build success
notification from the kbuild robot.
Change summary:
- Region media error reporting: A libnvdimm region device is the
parent to one or more namespaces. To date, media errors have been
reported via the "badblocks" attribute attached to pmem block
devices for namespaces in "raw" or "memory" mode. Given that
namespaces can be in "device-dax" or "btt-sector" mode this new
interface reports media errors generically, i.e. independent of
namespace modes or state.
This subsequently allows userspace tooling to craft "ACPI 6.1
Section 9.20.7.6 Function Index 4 - Clear Uncorrectable Error"
requests and submit them via the ioctl path for NVDIMM root bus
devices.
- Introduce 'struct dax_device' and 'struct dax_operations': Prompted
by a request from Linus and feedback from Christoph this allows for
dax capable drivers to publish their own custom dax operations.
This fixes the broken assumption that all dax operations are
related to a persistent memory device, and makes it easier for
other architectures and platforms to add customized persistent
memory support.
- 'libnvdimm' core updates: A new "deep_flush" sysfs attribute is
available for storage appliance applications to manually trigger
memory controllers to drain write-pending buffers that would
otherwise be flushed automatically by the platform ADR
(asynchronous-DRAM-refresh) mechanism at a power loss event.
Support for "locked" DIMMs is included to prevent namespaces from
surfacing when the namespace label data area is locked. Finally,
fixes for various reported deadlocks and crashes, also tagged for
-stable.
- ACPI / nfit driver updates: General updates of the nfit driver to
add DSM command overrides, ACPI 6.1 health state flags support, DSM
payload debug available by default, and various fixes.
Acknowledgements that came after the branch was pushed:
- commmit 565851c972b5 "device-dax: fix sysfs attribute deadlock":
Tested-by: Yi Zhang <yizhan@redhat.com>
- commit 23f498448362 "libnvdimm: rework region badblocks clearing"
Tested-by: Toshi Kani <toshi.kani@hpe.com>"
* tag 'libnvdimm-for-4.12' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (52 commits)
libnvdimm, pfn: fix 'npfns' vs section alignment
libnvdimm: handle locked label storage areas
libnvdimm: convert NDD_ flags to use bitops, introduce NDD_LOCKED
brd: fix uninitialized use of brd->dax_dev
block, dax: use correct format string in bdev_dax_supported
device-dax: fix sysfs attribute deadlock
libnvdimm: restore "libnvdimm: band aid btt vs clear poison locking"
libnvdimm: fix nvdimm_bus_lock() vs device_lock() ordering
libnvdimm: rework region badblocks clearing
acpi, nfit: kill ACPI_NFIT_DEBUG
libnvdimm: fix clear length of nvdimm_forget_poison()
libnvdimm, pmem: fix a NULL pointer BUG in nd_pmem_notify
libnvdimm, region: sysfs trigger for nvdimm_flush()
libnvdimm: fix phys_addr for nvdimm_clear_poison
x86, dax, pmem: remove indirection around memcpy_from_pmem()
block: remove block_device_operations ->direct_access()
block, dax: convert bdev_dax_supported() to dax_direct_access()
filesystem-dax: convert to dax_direct_access()
Revert "block: use DAX for partition table reads"
ext2, ext4, xfs: retrieve dax_device for iomap operations
...
Diffstat (limited to 'drivers')
39 files changed, 1385 insertions, 614 deletions
diff --git a/drivers/Makefile b/drivers/Makefile index 8f8bdc9e3d29..903b19199b69 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_PARPORT) += parport/ obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ -obj-$(CONFIG_DEV_DAX) += dax/ +obj-$(CONFIG_DAX) += dax/ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig index dd0d53c52552..6d3351452ea2 100644 --- a/drivers/acpi/nfit/Kconfig +++ b/drivers/acpi/nfit/Kconfig @@ -12,15 +12,3 @@ config ACPI_NFIT To compile this driver as a module, choose M here: the module will be called nfit. - -config ACPI_NFIT_DEBUG - bool "NFIT DSM debug" - depends on ACPI_NFIT - depends on DYNAMIC_DEBUG - default n - help - Enabling this option causes the nfit driver to dump the - input and output buffers of _DSM operations on the ACPI0012 - device and its children. This can be very verbose, so leave - it disabled unless you are debugging a hardware / firmware - issue. diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index c8ea9d698cd0..656acb5d7166 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -49,7 +49,16 @@ MODULE_PARM_DESC(scrub_overflow_abort, static bool disable_vendor_specific; module_param(disable_vendor_specific, bool, S_IRUGO); MODULE_PARM_DESC(disable_vendor_specific, - "Limit commands to the publicly specified set\n"); + "Limit commands to the publicly specified set"); + +static unsigned long override_dsm_mask; +module_param(override_dsm_mask, ulong, S_IRUGO); +MODULE_PARM_DESC(override_dsm_mask, "Bitmask of allowed NVDIMM DSM functions"); + +static int default_dsm_family = -1; +module_param(default_dsm_family, int, S_IRUGO); +MODULE_PARM_DESC(default_dsm_family, + "Try this DSM type first when identifying NVDIMM family"); LIST_HEAD(acpi_descs); DEFINE_MUTEX(acpi_desc_lock); @@ -175,14 +184,29 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 status) return 0; } +static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status) +{ + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: + if (status >> 16 & ND_CONFIG_LOCKED) + return -EACCES; + break; + default: + break; + } + + /* all other non-zero status results in an error */ + if (status) + return -EIO; + return 0; +} + static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd, u32 status) { if (!nvdimm) return xlat_bus_status(buf, cmd, status); - if (status) - return -EIO; - return 0; + return xlat_nvdimm_status(buf, cmd, status); } int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, @@ -259,14 +283,11 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, in_buf.buffer.length = call_pkg->nd_size_in; } - if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) { - dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n", - __func__, dimm_name, cmd, func, - in_buf.buffer.length); - print_hex_dump_debug("nvdimm in ", DUMP_PREFIX_OFFSET, 4, 4, + dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n", + __func__, dimm_name, cmd, func, in_buf.buffer.length); + print_hex_dump_debug("nvdimm in ", DUMP_PREFIX_OFFSET, 4, 4, in_buf.buffer.pointer, min_t(u32, 256, in_buf.buffer.length), true); - } out_obj = acpi_evaluate_dsm(handle, uuid, 1, func, &in_obj); if (!out_obj) { @@ -298,13 +319,11 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, goto out; } - if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) { - dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__, - dimm_name, cmd_name, out_obj->buffer.length); - print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, - 4, out_obj->buffer.pointer, min_t(u32, 128, - out_obj->buffer.length), true); - } + dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__, dimm_name, + cmd_name, out_obj->buffer.length); + print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, 4, + out_obj->buffer.pointer, + min_t(u32, 128, out_obj->buffer.length), true); for (i = 0, offset = 0; i < desc->out_num; i++) { u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf, @@ -448,9 +467,9 @@ static bool add_memdev(struct acpi_nfit_desc *acpi_desc, INIT_LIST_HEAD(&nfit_memdev->list); memcpy(nfit_memdev->memdev, memdev, sizeof(*memdev)); list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs); - dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n", + dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d flags: %#x\n", __func__, memdev->device_handle, memdev->range_index, - memdev->region_index); + memdev->region_index, memdev->flags); return true; } @@ -729,28 +748,38 @@ static void nfit_mem_init_bdw(struct acpi_nfit_desc *acpi_desc, } } -static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc, +static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc, struct acpi_nfit_system_address *spa) { struct nfit_mem *nfit_mem, *found; struct nfit_memdev *nfit_memdev; - int type = nfit_spa_type(spa); + int type = spa ? nfit_spa_type(spa) : 0; switch (type) { case NFIT_SPA_DCR: case NFIT_SPA_PM: break; default: - return 0; + if (spa) + return 0; } + /* + * This loop runs in two modes, when a dimm is mapped the loop + * adds memdev associations to an existing dimm, or creates a + * dimm. In the unmapped dimm case this loop sweeps for memdev + * instances with an invalid / zero range_index and adds those + * dimms without spa associations. + */ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct nfit_flush *nfit_flush; struct nfit_dcr *nfit_dcr; u32 device_handle; u16 dcr; - if (nfit_memdev->memdev->range_index != spa->range_index) + if (spa && nfit_memdev->memdev->range_index != spa->range_index) + continue; + if (!spa && nfit_memdev->memdev->range_index) continue; found = NULL; dcr = nfit_memdev->memdev->region_index; @@ -835,14 +864,15 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc, break; } nfit_mem_init_bdw(acpi_desc, nfit_mem, spa); - } else { + } else if (type == NFIT_SPA_PM) { /* * A single dimm may belong to multiple SPA-PM * ranges, record at least one in addition to * any SPA-DCR range. */ nfit_mem->memdev_pmem = nfit_memdev->memdev; - } + } else + nfit_mem->memdev_dcr = nfit_memdev->memdev; } return 0; @@ -866,6 +896,8 @@ static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b) static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) { struct nfit_spa *nfit_spa; + int rc; + /* * For each SPA-DCR or SPA-PMEM address range find its @@ -876,13 +908,20 @@ static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) * BDWs are optional. */ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { - int rc; - - rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa); + rc = __nfit_mem_init(acpi_desc, nfit_spa->spa); if (rc) return rc; } + /* + * If a DIMM has failed to be mapped into SPA there will be no + * SPA entries above. Find and register all the unmapped DIMMs + * for reporting and recovery purposes. + */ + rc = __nfit_mem_init(acpi_desc, NULL); + if (rc) + return rc; + list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp); return 0; @@ -1237,12 +1276,14 @@ static ssize_t flags_show(struct device *dev, { u16 flags = to_nfit_memdev(dev)->flags; - return sprintf(buf, "%s%s%s%s%s\n", + return sprintf(buf, "%s%s%s%s%s%s%s\n", flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save_fail " : "", flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore_fail " : "", flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush_fail " : "", flags & ACPI_NFIT_MEM_NOT_ARMED ? "not_armed " : "", - flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : ""); + flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart_event " : "", + flags & ACPI_NFIT_MEM_MAP_FAILED ? "map_fail " : "", + flags & ACPI_NFIT_MEM_HEALTH_ENABLED ? "smart_notify " : ""); } static DEVICE_ATTR_RO(flags); @@ -1290,8 +1331,16 @@ static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj, struct device *dev = container_of(kobj, struct device, kobj); struct nvdimm *nvdimm = to_nvdimm(dev); - if (!to_nfit_dcr(dev)) + if (!to_nfit_dcr(dev)) { + /* Without a dcr only the memdev attributes can be surfaced */ + if (a == &dev_attr_handle.attr || a == &dev_attr_phys_id.attr + || a == &dev_attr_flags.attr + || a == &dev_attr_family.attr + || a == &dev_attr_dsm_mask.attr) + return a->mode; return 0; + } + if (a == &dev_attr_format1.attr && num_nvdimm_formats(nvdimm) <= 1) return 0; return a->mode; @@ -1368,6 +1417,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, unsigned long dsm_mask; const u8 *uuid; int i; + int family = -1; /* nfit test assumes 1:1 relationship between commands and dsms */ nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en; @@ -1398,11 +1448,14 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, */ for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++) if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) - break; + if (family < 0 || i == default_dsm_family) + family = i; /* limit the supported commands to those that are publicly documented */ - nfit_mem->family = i; - if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { + nfit_mem->family = family; + if (override_dsm_mask && !disable_vendor_specific) + dsm_mask = override_dsm_mask; + else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { dsm_mask = 0x3fe; if (disable_vendor_specific) dsm_mask &= ~(1 << ND_CMD_VENDOR); @@ -1462,6 +1515,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct acpi_nfit_flush_address *flush; unsigned long flags = 0, cmd_mask; + struct nfit_memdev *nfit_memdev; u32 device_handle; u16 mem_flags; @@ -1473,11 +1527,22 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) } if (nfit_mem->bdw && nfit_mem->memdev_pmem) - flags |= NDD_ALIASING; + set_bit(NDD_ALIASING, &flags); + + /* collate flags across all memdevs for this dimm */ + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + struct acpi_nfit_memory_map *dimm_memdev; + + dimm_memdev = __to_nfit_memdev(nfit_mem); + if (dimm_memdev->device_handle + != nfit_memdev->memdev->device_handle) + continue; + dimm_memdev->flags |= nfit_memdev->memdev->flags; + } mem_flags = __to_nfit_memdev(nfit_mem)->flags; if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED) - flags |= NDD_UNARMED; + set_bit(NDD_UNARMED, &flags); rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle); if (rc) @@ -1507,12 +1572,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0) continue; - dev_info(acpi_desc->dev, "%s flags:%s%s%s%s\n", + dev_info(acpi_desc->dev, "%s flags:%s%s%s%s%s\n", nvdimm_name(nvdimm), mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? " save_fail" : "", mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? " restore_fail":"", mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? " flush_fail" : "", - mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : ""); + mem_flags & ACPI_NFIT_MEM_NOT_ARMED ? " not_armed" : "", + mem_flags & ACPI_NFIT_MEM_MAP_FAILED ? " map_fail" : ""); } @@ -1783,8 +1849,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, mmio_flush_range((void __force *) mmio->addr.aperture + offset, c); - memcpy_from_pmem(iobuf + copied, - mmio->addr.aperture + offset, c); + memcpy(iobuf + copied, mmio->addr.aperture + offset, c); } copied += c; @@ -2525,6 +2590,7 @@ static void acpi_nfit_scrub(struct work_struct *work) acpi_nfit_register_region(acpi_desc, nfit_spa); } } + acpi_desc->init_complete = 1; list_for_each_entry(nfit_spa, &acpi_desc->spas, list) acpi_nfit_async_scrub(acpi_desc, nfit_spa); @@ -2547,7 +2613,8 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc) return rc; } - queue_work(nfit_wq, &acpi_desc->work); + if (!acpi_desc->cancel) + queue_work(nfit_wq, &acpi_desc->work); return 0; } @@ -2593,32 +2660,11 @@ static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc) return 0; } -static void acpi_nfit_destruct(void *data) +static void acpi_nfit_unregister(void *data) { struct acpi_nfit_desc *acpi_desc = data; - struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); - - /* - * Destruct under acpi_desc_lock so that nfit_handle_mce does not - * race teardown - */ - mutex_lock(&acpi_desc_lock); - acpi_desc->cancel = 1; - /* - * Bounce the nvdimm bus lock to make sure any in-flight - * acpi_nfit_ars_rescan() submissions have had a chance to - * either submit or see ->cancel set. - */ - device_lock(bus_dev); - device_unlock(bus_dev); - flush_workqueue(nfit_wq); - if (acpi_desc->scrub_count_state) - sysfs_put(acpi_desc->scrub_count_state); nvdimm_bus_unregister(acpi_desc->nvdimm_bus); - acpi_desc->nvdimm_bus = NULL; - list_del(&acpi_desc->list); - mutex_unlock(&acpi_desc_lock); } int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz) @@ -2636,7 +2682,7 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz) if (!acpi_desc->nvdimm_bus) return -ENOMEM; - rc = devm_add_action_or_reset(dev, acpi_nfit_destruct, + rc = devm_add_action_or_reset(dev, acpi_nfit_unregister, acpi_desc); if (rc) return rc; @@ -2728,6 +2774,13 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc) device_lock(dev); device_unlock(dev); + /* bounce the init_mutex to make init_complete valid */ + mutex_lock(&acpi_desc->init_mutex); + if (acpi_desc->cancel || acpi_desc->init_complete) { + mutex_unlock(&acpi_desc->init_mutex); + return 0; + } + /* * Scrub work could take 10s of seconds, userspace may give up so we * need to be interruptible while waiting. @@ -2735,6 +2788,7 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc) INIT_WORK_ONSTACK(&flush.work, flush_probe); COMPLETION_INITIALIZER_ONSTACK(flush.cmp); queue_work(nfit_wq, &flush.work); + mutex_unlock(&acpi_desc->init_mutex); rc = wait_for_completion_interruptible(&flush.cmp); cancel_work_sync(&flush.work); @@ -2771,10 +2825,12 @@ int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc) if (work_busy(&acpi_desc->work)) return -EBUSY; - if (acpi_desc->cancel) + mutex_lock(&acpi_desc->init_mutex); + if (acpi_desc->cancel) { + mutex_unlock(&acpi_desc->init_mutex); return 0; + } - mutex_lock(&acpi_desc->init_mutex); list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { struct acpi_nfit_system_address *spa = nfit_spa->spa; @@ -2818,6 +2874,40 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev) } EXPORT_SYMBOL_GPL(acpi_nfit_desc_init); +static void acpi_nfit_put_table(void *table) +{ + acpi_put_table(table); +} + +void acpi_nfit_shutdown(void *data) +{ + struct acpi_nfit_desc *acpi_desc = data; + struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus); + + /* + * Destruct under acpi_desc_lock so that nfit_handle_mce does not + * race teardown + */ + mutex_lock(&acpi_desc_lock); + list_del(&acpi_desc->list); + mutex_unlock(&acpi_desc_lock); + + mutex_lock(&acpi_desc->init_mutex); + acpi_desc->cancel = 1; + mutex_unlock(&acpi_desc->init_mutex); + + /* + * Bounce the nvdimm bus lock to make sure any in-flight + * acpi_nfit_ars_rescan() submissions have had a chance to + * either submit or see ->cancel set. + */ + device_lock(bus_dev); + device_unlock(bus_dev); + + flush_workqueue(nfit_wq); +} +EXPORT_SYMBOL_GPL(acpi_nfit_shutdown); + static int acpi_nfit_add(struct acpi_device *adev) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -2834,6 +2924,10 @@ static int acpi_nfit_add(struct acpi_device *adev) dev_dbg(dev, "failed to find NFIT at startup\n"); return 0; } + + rc = devm_add_action_or_reset(dev, acpi_nfit_put_table, tbl); + if (rc) + return rc; sz = tbl->length; acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); @@ -2861,12 +2955,15 @@ static int acpi_nfit_add(struct acpi_device *adev) rc = acpi_nfit_init(acpi_desc, (void *) tbl + sizeof(struct acpi_table_nfit), sz - sizeof(struct acpi_table_nfit)); - return rc; + + if (rc) + return rc; + return devm_add_action_or_reset(dev, acpi_nfit_shutdown, acpi_desc); } static int acpi_nfit_remove(struct acpi_device *adev) { - /* see acpi_nfit_destruct */ + /* see acpi_nfit_unregister */ return 0; } diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index fc29c2e9832e..58fb7d68e04a 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -37,7 +37,7 @@ #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \ | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ - | ACPI_NFIT_MEM_NOT_ARMED) + | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED) enum nfit_uuids { /* for simplicity alias the uuid index with the family id */ @@ -163,6 +163,7 @@ struct acpi_nfit_desc { unsigned int scrub_count; unsigned int scrub_mode; unsigned int cancel:1; + unsigned int init_complete:1; unsigned long dimm_cmd_force_en; unsigned long bus_cmd_force_en; int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, @@ -238,6 +239,7 @@ static inline struct acpi_nfit_desc *to_acpi_desc( const u8 *to_nfit_uuid(enum nfit_uuids id); int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz); +void acpi_nfit_shutdown(void *data); void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event); void __acpi_nvdimm_notify(struct device *dev, u32 event); int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index d545abbd5378..8ddc98279c8f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -323,6 +323,7 @@ config BLK_DEV_SX8 config BLK_DEV_RAM tristate "RAM block device support" + select DAX if BLK_DEV_RAM_DAX ---help--- Saying Y here will allow you to use a portion of your RAM memory as a block device, so that you can make file systems on it, read and diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 4ec84d504780..57b574f2f66a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #ifdef CONFIG_BLK_DEV_RAM_DAX #include <linux/pfn_t.h> +#include <linux/dax.h> #endif #include <linux/uaccess.h> @@ -41,6 +42,9 @@ struct brd_device { struct request_queue *brd_queue; struct gendisk *brd_disk; +#ifdef CONFIG_BLK_DEV_RAM_DAX + struct dax_device *dax_dev; +#endif struct list_head brd_list; /* @@ -326,30 +330,38 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, } #ifdef CONFIG_BLK_DEV_RAM_DAX -static long brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { - struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; if (!brd) return -ENODEV; - page = brd_insert_page(brd, sector); + page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512); if (!page) return -ENOSPC; *kaddr = page_address(page); *pfn = page_to_pfn_t(page); - return PAGE_SIZE; + return 1; } -#else -#define brd_direct_access NULL + +static long brd_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct brd_device *brd = dax_get_private(dax_dev); + + return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn); +} + +static const struct dax_operations brd_dax_ops = { + .direct_access = brd_dax_direct_access, +}; #endif static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .rw_page = brd_rw_page, - .direct_access = brd_direct_access, }; /* @@ -415,9 +427,6 @@ static struct brd_device *brd_alloc(int i) * is harmless) */ blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); -#ifdef CONFIG_BLK_DEV_RAM_DAX - queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); -#endif disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; @@ -430,8 +439,21 @@ static struct brd_device *brd_alloc(int i) sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); +#ifdef CONFIG_BLK_DEV_RAM_DAX + queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); + brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops); + if (!brd->dax_dev) + goto out_free_inode; +#endif + + return brd; +#ifdef CONFIG_BLK_DEV_RAM_DAX +out_free_inode: + kill_dax(brd->dax_dev); + put_dax(brd->dax_dev); +#endif out_free_queue: blk_cleanup_queue(brd->brd_queue); out_free_dev: @@ -471,6 +493,10 @@ out: static void brd_del_one(struct brd_device *brd) { list_del(&brd->brd_list); +#ifdef CONFIG_BLK_DEV_RAM_DAX + kill_dax(brd->dax_dev); + put_dax(brd->dax_dev); +#endif del_gendisk(brd->brd_disk); brd_free(brd); } diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 9e95bf94eb13..b7053eafd88e 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -1,8 +1,13 @@ -menuconfig DEV_DAX +menuconfig DAX tristate "DAX: direct access to differentiated memory" + select SRCU default m if NVDIMM_DAX + +if DAX + +config DEV_DAX + tristate "Device DAX: direct access mapping device" depends on TRANSPARENT_HUGEPAGE - select SRCU help Support raw access to differentiated (persistence, bandwidth, latency...) memory via an mmap(2) capable character @@ -11,7 +16,6 @@ menuconfig DEV_DAX baseline memory pool. Mappings of a /dev/daxX.Y device impose restrictions that make the mapping behavior deterministic. -if DEV_DAX config DEV_DAX_PMEM tristate "PMEM DAX: direct access to persistent memory" diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 27c54e38478a..dc7422530462 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -1,4 +1,7 @@ -obj-$(CONFIG_DEV_DAX) += dax.o +obj-$(CONFIG_DAX) += dax.o +obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o +dax-y := super.o dax_pmem-y := pmem.o +device_dax-y := device.o diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h new file mode 100644 index 000000000000..b6fc4f04636d --- /dev/null +++ b/drivers/dax/dax-private.h @@ -0,0 +1,57 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __DAX_PRIVATE_H__ +#define __DAX_PRIVATE_H__ + +#include <linux/device.h> +#include <linux/cdev.h> + +/** + * struct dax_region - mapping infrastructure for dax devices + * @id: kernel-wide unique region for a memory range + * @base: linear address corresponding to @res + * @kref: to pin while other agents have a need to do lookups + * @dev: parent device backing this region + * @align: allocation and mapping alignment for child dax devices + * @res: physical address range of the region + * @pfn_flags: identify whether the pfns are paged back or not + */ +struct dax_region { + int id; + struct ida ida; + void *base; + struct kref kref; + struct device *dev; + unsigned int align; + struct resource res; + unsigned long pfn_flags; +}; + +/** + * struct dev_dax - instance data for a subdivision of a dax region + * @region - parent region + * @dax_dev - core dax functionality + * @dev - device core + * @id - child id in the region + * @num_resources - number of physical address extents in this device + * @res - array of physical address ranges + */ +struct dev_dax { + struct dax_region *region; + struct dax_device *dax_dev; + struct device dev; + int id; + int num_resources; + struct resource res[0]; +}; +#endif diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index ddd829ab58c0..f9e5feea742c 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. All rights reserved. + * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -12,14 +12,7 @@ */ #ifndef __DAX_H__ #define __DAX_H__ -struct device; -struct dax_dev; -struct resource; -struct dax_region; -void dax_region_put(struct dax_region *dax_region); -struct dax_region *alloc_dax_region(struct device *parent, - int region_id, struct resource *res, unsigned int align, - void *addr, unsigned long flags); -struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, - struct resource *res, int count); +struct dax_device; +struct dax_device *inode_dax(struct inode *inode); +struct inode *dax_inode(struct dax_device *dax_dev); #endif /* __DAX_H__ */ diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h new file mode 100644 index 000000000000..fdcd9769ffde --- /dev/null +++ b/drivers/dax/device-dax.h @@ -0,0 +1,25 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __DEVICE_DAX_H__ +#define __DEVICE_DAX_H__ +struct device; +struct dev_dax; +struct resource; +struct dax_region; +void dax_region_put(struct dax_region *dax_region); +struct dax_region *alloc_dax_region(struct device *parent, + int region_id, struct resource *res, unsigned int align, + void *addr, unsigned long flags); +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, + struct resource *res, int count); +#endif /* __DEVICE_DAX_H__ */ diff --git a/drivers/dax/dax.c b/drivers/dax/device.c index 19795eb35579..006e657dfcb9 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/device.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. All rights reserved. + * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -13,100 +13,38 @@ #include <linux/pagemap.h> #include <linux/module.h> #include <linux/device.h> -#include <linux/magic.h> -#include <linux/mount.h> #include <linux/pfn_t.h> -#include <linux/hash.h> #include <linux/cdev.h> #include <linux/slab.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> +#include "dax-private.h" #include "dax.h" -static dev_t dax_devt; -DEFINE_STATIC_SRCU(dax_srcu); static struct class *dax_class; -static DEFINE_IDA(dax_minor_ida); -static int nr_dax = CONFIG_NR_DEV_DAX; -module_param(nr_dax, int, S_IRUGO); -static struct vfsmount *dax_mnt; -static struct kmem_cache *dax_cache __read_mostly; -static struct super_block *dax_superblock __read_mostly; -MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); - -/** - * struct dax_region - mapping infrastructure for dax devices - * @id: kernel-wide unique region for a memory range - * @base: linear address corresponding to @res - * @kref: to pin while other agents have a need to do lookups - * @dev: parent device backing this region - * @align: allocation and mapping alignment for child dax devices - * @res: physical address range of the region - * @pfn_flags: identify whether the pfns are paged back or not - */ -struct dax_region { - int id; - struct ida ida; - void *base; - struct kref kref; - struct device *dev; - unsigned int align; - struct resource res; - unsigned long pfn_flags; -}; -/** - * struct dax_dev - subdivision of a dax region - * @region - parent region - * @dev - device backing the character device - * @cdev - core chardev data - * @alive - !alive + srcu grace period == no new mappings can be established - * @id - child id in the region - * @num_resources - number of physical address extents in this device - * @res - array of physical address ranges +/* + * Rely on the fact that drvdata is set before the attributes are + * registered, and that the attributes are unregistered before drvdata + * is cleared to assume that drvdata is always valid. */ -struct dax_dev { - struct dax_region *region; - struct inode *inode; - struct device dev; - struct cdev cdev; - bool alive; - int id; - int num_resources; - struct resource res[0]; -}; - static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%d\n", dax_region->id); - device_unlock(dev); + struct dax_region *dax_region = dev_get_drvdata(dev); - return rc; + return sprintf(buf, "%d\n", dax_region->id); } static DEVICE_ATTR_RO(id); static ssize_t region_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&dax_region->res)); - device_unlock(dev); + struct dax_region *dax_region = dev_get_drvdata(dev); - return rc; + return sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&dax_region->res)); } static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, region_size_show, NULL); @@ -114,16 +52,9 @@ static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, static ssize_t align_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%u\n", dax_region->align); - device_unlock(dev); + struct dax_region *dax_region = dev_get_drvdata(dev); - return rc; + return sprintf(buf, "%u\n", dax_region->align); } static DEVICE_ATTR_RO(align); @@ -144,117 +75,6 @@ static const struct attribute_group *dax_region_attribute_groups[] = { NULL, }; -static struct inode *dax_alloc_inode(struct super_block *sb) -{ - return kmem_cache_alloc(dax_cache, GFP_KERNEL); -} - -static void dax_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - - kmem_cache_free(dax_cache, inode); -} - -static void dax_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, dax_i_callback); -} - -static const struct super_operations dax_sops = { - .statfs = simple_statfs, - .alloc_inode = dax_alloc_inode, - .destroy_inode = dax_destroy_inode, - .drop_inode = generic_delete_inode, -}; - -static struct dentry *dax_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); -} - -static struct file_system_type dax_type = { - .name = "dax", - .mount = dax_mount, - .kill_sb = kill_anon_super, -}; - -static int dax_test(struct inode *inode, void *data) -{ - return inode->i_cdev == data; -} - -static int dax_set(struct inode *inode, void *data) -{ - inode->i_cdev = data; - return 0; -} - -static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) -{ - struct inode *inode; - - inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), - dax_test, dax_set, cdev); - - if (!inode) - return NULL; - - if (inode->i_state & I_NEW) { - inode->i_mode = S_IFCHR; - inode->i_flags = S_DAX; - inode->i_rdev = devt; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - unlock_new_inode(inode); - } - return inode; -} - -static void init_once(void *inode) -{ - inode_init_once(inode); -} - -static int dax_inode_init(void) -{ - int rc; - - dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, - (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), - init_once); - if (!dax_cache) - return -ENOMEM; - - rc = register_filesystem(&dax_type); - if (rc) - goto err_register_fs; - - dax_mnt = kern_mount(&dax_type); - if (IS_ERR(dax_mnt)) { - rc = PTR_ERR(dax_mnt); - goto err_mount; - } - dax_superblock = dax_mnt->mnt_sb; - - return 0; - - err_mount: - unregister_filesystem(&dax_type); - err_register_fs: - kmem_cache_destroy(dax_cache); - - return rc; -} - -static void dax_inode_exit(void) -{ - kern_unmount(dax_mnt); - unregister_filesystem(&dax_type); - kmem_cache_destroy(dax_cache); -} - static void dax_region_free(struct kref *kref) { struct dax_region *dax_region; @@ -323,47 +143,47 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); -static struct dax_dev *to_dax_dev(struct device *dev) +static struct dev_dax *to_dev_dax(struct device *dev) { - return container_of(dev, struct dax_dev, dev); + return container_of(dev, struct dev_dax, dev); } static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_dev *dax_dev = to_dax_dev(dev); + struct dev_dax *dev_dax = to_dev_dax(dev); unsigned long long size = 0; int i; - for (i = 0; i < dax_dev->num_resources; i++) - size += resource_size(&dax_dev->res[i]); + for (i = 0; i < dev_dax->num_resources; i++) + size += resource_size(&dev_dax->res[i]); return sprintf(buf, "%llu\n", size); } static DEVICE_ATTR_RO(size); -static struct attribute *dax_device_attributes[] = { +static struct attribute *dev_dax_attributes[] = { &dev_attr_size.attr, NULL, }; -static const struct attribute_group dax_device_attribute_group = { - .attrs = dax_device_attributes, +static const struct attribute_group dev_dax_attribute_group = { + .attrs = dev_dax_attributes, }; static const struct attribute_group *dax_attribute_groups[] = { - &dax_device_attribute_group, + &dev_dax_attribute_group, NULL, }; -static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, +static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { - struct dax_region *dax_region = dax_dev->region; - struct device *dev = &dax_dev->dev; + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; unsigned long mask; - if (!dax_dev->alive) + if (!dax_alive(dev_dax->dax_dev)) return -ENXIO; /* prevent private mappings from being established */ @@ -397,23 +217,24 @@ static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, return 0; } -static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, +/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ +__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size) { struct resource *res; phys_addr_t phys; int i; - for (i = 0; i < dax_dev->num_resources; i++) { - res = &dax_dev->res[i]; + for (i = 0; i < dev_dax->num_resources; i++) { + res = &dev_dax->res[i]; phys = pgoff * PAGE_SIZE + res->start; if (phys >= res->start && phys <= res->end) break; pgoff -= PHYS_PFN(resource_size(res)); } - if (i < dax_dev->num_resources) { - res = &dax_dev->res[i]; + if (i < dev_dax->num_resources) { + res = &dev_dax->res[i]; if (phys + size - 1 <= res->end) return phys; } @@ -421,28 +242,29 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, return -1; } -static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) { - struct device *dev = &dax_dev->dev; + struct device *dev = &dev_dax->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; phys_addr_t phys; pfn_t pfn; unsigned int fault_size = PAGE_SIZE; - if (check_vma(dax_dev, vmf->vma, __func__)) + if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dax_dev->region; + dax_region = dev_dax->region; if (dax_region->align > PAGE_SIZE) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); return VM_FAULT_SIGBUS; } if (fault_size != dax_region->align) return VM_FAULT_SIGBUS; - phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); + phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, vmf->pgoff); @@ -461,28 +283,29 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_NOPAGE; } -static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; - struct device *dev = &dax_dev->dev; + struct device *dev = &dev_dax->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; pfn_t pfn; unsigned int fault_size = PMD_SIZE; - if (check_vma(dax_dev, vmf->vma, __func__)) + if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dax_dev->region; + dax_region = dev_dax->region; if (dax_region->align > PMD_SIZE) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); return VM_FAULT_SIGBUS; } /* dax pmd mappings require pfn_t_devmap() */ if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); return VM_FAULT_SIGBUS; } @@ -497,7 +320,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pgoff = linear_page_index(vmf->vma, pmd_addr); - phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); + phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); @@ -511,10 +334,10 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) { unsigned long pud_addr = vmf->address & PUD_MASK; - struct device *dev = &dax_dev->dev; + struct device *dev = &dev_dax->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; @@ -522,18 +345,19 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) unsigned int fault_size = PUD_SIZE; - if (check_vma(dax_dev, vmf->vma, __func__)) + if (check_vma(dev_dax, vmf->vma, __func__)) return VM_FAULT_SIGBUS; - dax_region = dax_dev->region; + dax_region = dev_dax->region; if (dax_region->align > PUD_SIZE) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); return VM_FAULT_SIGBUS; } /* dax pud mappings require pfn_t_devmap() */ if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "%s: alignment > fault size\n", __func__); + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); return VM_FAULT_SIGBUS; } @@ -548,7 +372,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pgoff = linear_page_index(vmf->vma, pud_addr); - phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); + phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); @@ -561,65 +385,71 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) vmf->flags & FAULT_FLAG_WRITE); } #else -static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) { return VM_FAULT_FALLBACK; } #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static int dax_dev_huge_fault(struct vm_fault *vmf, +static int dev_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { int rc, id; struct file *filp = vmf->vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; + struct dev_dax *dev_dax = filp->private_data; - dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", - vmf->vma->vm_start, vmf->vma->vm_end); + vmf->vma->vm_start, vmf->vma->vm_end, pe_size); - id = srcu_read_lock(&dax_srcu); + id = dax_read_lock(); switch (pe_size) { case PE_SIZE_PTE: - rc = __dax_dev_pte_fault(dax_dev, vmf); + rc = __dev_dax_pte_fault(dev_dax, vmf); break; case PE_SIZE_PMD: - rc = __dax_dev_pmd_fault(dax_dev, vmf); + rc = __dev_dax_pmd_fault(dev_dax, vmf); break; case PE_SIZE_PUD: - rc = __dax_dev_pud_fault(dax_dev, vmf); + rc = __dev_dax_pud_fault(dev_dax, vmf); break; default: - return VM_FAULT_FALLBACK; + rc = VM_FAULT_SIGBUS; } - srcu_read_unlock(&dax_srcu, id); + dax_read_unlock(id); return rc; } -static int dax_dev_fault(struct vm_fault *vmf) +static int dev_dax_fault(struct vm_fault *vmf) { - return dax_dev_huge_fault(vmf, PE_SIZE_PTE); + return dev_dax_huge_fault(vmf, PE_SIZE_PTE); } -static const struct vm_operations_struct dax_dev_vm_ops = { - .fault = dax_dev_fault, - .huge_fault = dax_dev_huge_fault, +static const struct vm_operations_struct dax_vm_ops = { + .fault = dev_dax_fault, + .huge_fault = dev_dax_huge_fault, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) { - struct dax_dev *dax_dev = filp->private_data; - int rc; + struct dev_dax *dev_dax = filp->private_data; + int rc, id; - dev_dbg(&dax_dev->dev, "%s\n", __func__); + dev_dbg(&dev_dax->dev, "%s\n", __func__); - rc = check_vma(dax_dev, vma, __func__); + /* + * We lock to check dax_dev liveness and will re-check at + * fault time. + */ + id = dax_read_lock(); + rc = check_vma(dev_dax, vma, __func__); + dax_read_unlock(id); if (rc) return rc; - vma->vm_ops = &dax_dev_vm_ops; + vma->vm_ops = &dax_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; } @@ -630,13 +460,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp, unsigned long flags) { unsigned long off, off_end, off_align, len_align, addr_align, align; - struct dax_dev *dax_dev = filp ? filp->private_data : NULL; + struct dev_dax *dev_dax = filp ? filp->private_data : NULL; struct dax_region *dax_region; - if (!dax_dev || addr) + if (!dev_dax || addr) goto out; - dax_region = dax_dev->region; + dax_region = dev_dax->region; align = dax_region->align; off = pgoff << PAGE_SHIFT; off_end = off + len; @@ -661,14 +491,15 @@ static unsigned long dax_get_unmapped_area(struct file *filp, static int dax_open(struct inode *inode, struct file *filp) { - struct dax_dev *dax_dev; + struct dax_device *dax_dev = inode_dax(inode); + struct inode *__dax_inode = dax_inode(dax_dev); + struct dev_dax *dev_dax = dax_get_private(dax_dev); - dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); - dev_dbg(&dax_dev->dev, "%s\n", __func__); - inode->i_mapping = dax_dev->inode->i_mapping; - inode->i_mapping->host = dax_dev->inode; + dev_dbg(&dev_dax->dev, "%s\n", __func__); + inode->i_mapping = __dax_inode->i_mapping; + inode->i_mapping->host = __dax_inode; filp->f_mapping = inode->i_mapping; - filp->private_data = dax_dev; + filp->private_data = dev_dax; inode->i_flags = S_DAX; return 0; @@ -676,9 +507,9 @@ static int dax_open(struct inode *inode, struct file *filp) static int dax_release(struct inode *inode, struct file *filp) { - struct dax_dev *dax_dev = filp->private_data; + struct dev_dax *dev_dax = filp->private_data; - dev_dbg(&dax_dev->dev, "%s\n", __func__); + dev_dbg(&dev_dax->dev, "%s\n", __func__); return 0; } @@ -691,55 +522,54 @@ static const struct file_operations dax_fops = { .mmap = dax_mmap, }; -static void dax_dev_release(struct device *dev) +static void dev_dax_release(struct device *dev) { - struct dax_dev *dax_dev = to_dax_dev(dev); - struct dax_region *dax_region = dax_dev->region; + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + struct dax_device *dax_dev = dev_dax->dax_dev; - ida_simple_remove(&dax_region->ida, dax_dev->id); - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); + ida_simple_remove(&dax_region->ida, dev_dax->id); dax_region_put(dax_region); - iput(dax_dev->inode); - kfree(dax_dev); + put_dax(dax_dev); + kfree(dev_dax); } -static void kill_dax_dev(struct dax_dev *dax_dev) +static void kill_dev_dax(struct dev_dax *dev_dax) { - /* - * Note, rcu is not protecting the liveness of dax_dev, rcu is - * ensuring that any fault handlers that might have seen - * dax_dev->alive == true, have completed. Any fault handlers - * that start after synchronize_srcu() has started will abort - * upon seeing dax_dev->alive == false. - */ - dax_dev->alive = false; - synchronize_srcu(&dax_srcu); - unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + + kill_dax(dax_dev); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); } -static void unregister_dax_dev(void *dev) +static void unregister_dev_dax(void *dev) { - struct dax_dev *dax_dev = to_dax_dev(dev); + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + struct cdev *cdev = inode->i_cdev; dev_dbg(dev, "%s\n", __func__); - kill_dax_dev(dax_dev); - cdev_device_del(&dax_dev->cdev, dev); + kill_dev_dax(dev_dax); + cdev_device_del(cdev, dev); put_device(dev); } -struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, struct resource *res, int count) { struct device *parent = dax_region->dev; - struct dax_dev *dax_dev; - int rc = 0, minor, i; + struct dax_device *dax_dev; + struct dev_dax *dev_dax; + struct inode *inode; struct device *dev; struct cdev *cdev; - dev_t dev_t; + int rc = 0, i; - dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); - if (!dax_dev) + dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL); + if (!dev_dax) return ERR_PTR(-ENOMEM); for (i = 0; i < count; i++) { @@ -749,110 +579,79 @@ struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, rc = -EINVAL; break; } - dax_dev->res[i].start = res[i].start; - dax_dev->res[i].end = res[i].end; + dev_dax->res[i].start = res[i].start; + dev_dax->res[i].end = res[i].end; } if (i < count) goto err_id; - dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - if (dax_dev->id < 0) { - rc = dax_dev->id; + dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dev_dax->id < 0) { + rc = dev_dax->id; goto err_id; } - minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); - if (minor < 0) { - rc = minor; - goto err_minor; - } - - dev_t = MKDEV(MAJOR(dax_devt), minor); - dev = &dax_dev->dev; - dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); - if (!dax_dev->inode) { - rc = -ENOMEM; - goto err_inode; - } + /* + * No 'host' or dax_operations since there is no access to this + * device outside of mmap of the resulting character device. + */ + dax_dev = alloc_dax(dev_dax, NULL, NULL); + if (!dax_dev) + goto err_dax; /* from here on we're committed to teardown via dax_dev_release() */ + dev = &dev_dax->dev; device_initialize(dev); - cdev = &dax_dev->cdev; + inode = dax_inode(dax_dev); + cdev = inode->i_cdev; cdev_init(cdev, &dax_fops); cdev->owner = parent->driver->owner; - dax_dev->num_resources = count; - dax_dev->alive = true; - dax_dev->region = dax_region; + dev_dax->num_resources = count; + dev_dax->dax_dev = dax_dev; + dev_dax->region = dax_region; kref_get(&dax_region->kref); - dev->devt = dev_t; + dev->devt = inode->i_rdev; dev->class = dax_class; dev->parent = parent; dev->groups = dax_attribute_groups; - dev->release = dax_dev_release; - dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); + dev->release = dev_dax_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); rc = cdev_device_add(cdev, dev); if (rc) { - kill_dax_dev(dax_dev); + kill_dev_dax(dev_dax); put_device(dev); return ERR_PTR(rc); } - rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); if (rc) return ERR_PTR(rc); - return dax_dev; + return dev_dax; - err_inode: - ida_simple_remove(&dax_minor_ida, minor); - err_minor: - ida_simple_remove(&dax_region->ida, dax_dev->id); + err_dax: + ida_simple_remove(&dax_region->ida, dev_dax->id); err_id: - kfree(dax_dev); + kfree(dev_dax); return ERR_PTR(rc); } -EXPORT_SYMBOL_GPL(devm_create_dax_dev); +EXPORT_SYMBOL_GPL(devm_create_dev_dax); static int __init dax_init(void) { - int rc; - - rc = dax_inode_init(); - if (rc) - return rc; - - nr_dax = max(nr_dax, 256); - rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); - if (rc) - goto err_chrdev; - dax_class = class_create(THIS_MODULE, "dax"); - if (IS_ERR(dax_class)) { - rc = PTR_ERR(dax_class); - goto err_class; - } - - return 0; - - err_class: - unregister_chrdev_region(dax_devt, nr_dax); - err_chrdev: - dax_inode_exit(); - return rc; + return PTR_ERR_OR_ZERO(dax_class); } static void __exit dax_exit(void) { class_destroy(dax_class); - unregister_chrdev_region(dax_devt, nr_dax); - ida_destroy(&dax_minor_ida); - dax_inode_exit(); } MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index cb0d742fa23f..9f2a0b4fd801 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -16,7 +16,7 @@ #include <linux/pfn_t.h> #include "../nvdimm/pfn.h" #include "../nvdimm/nd.h" -#include "dax.h" +#include "device-dax.h" struct dax_pmem { struct device *dev; @@ -61,8 +61,8 @@ static int dax_pmem_probe(struct device *dev) int rc; void *addr; struct resource res; - struct dax_dev *dax_dev; struct nd_pfn_sb *pfn_sb; + struct dev_dax *dev_dax; struct dax_pmem *dax_pmem; struct nd_region *nd_region; struct nd_namespace_io *nsio; @@ -130,12 +130,12 @@ static int dax_pmem_probe(struct device *dev) return -ENOMEM; /* TODO: support for subdividing a dax region... */ - dax_dev = devm_create_dax_dev(dax_region, &res, 1); + dev_dax = devm_create_dev_dax(dax_region, &res, 1); - /* child dax_dev instances now own the lifetime of the dax_region */ + /* child dev_dax instances now own the lifetime of the dax_region */ dax_region_put(dax_region); - return PTR_ERR_OR_ZERO(dax_dev); + return PTR_ERR_OR_ZERO(dev_dax); } static struct nd_device_driver dax_pmem_driver = { diff --git a/drivers/dax/super.c b/drivers/dax/super.c new file mode 100644 index 000000000000..465dcd7317d5 --- /dev/null +++ b/drivers/dax/super.c @@ -0,0 +1,425 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/pagemap.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/magic.h> +#include <linux/cdev.h> +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/dax.h> +#include <linux/fs.h> + +static int nr_dax = CONFIG_NR_DEV_DAX; +module_param(nr_dax, int, S_IRUGO); +MODULE_PARM_DESC(nr_dax, "max number of dax device instances"); + +static dev_t dax_devt; +DEFINE_STATIC_SRCU(dax_srcu); +static struct vfsmount *dax_mnt; +static DEFINE_IDA(dax_minor_ida); +static struct kmem_cache *dax_cache __read_mostly; +static struct super_block *dax_superblock __read_mostly; + +#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) +static struct hlist_head dax_host_list[DAX_HASH_SIZE]; +static DEFINE_SPINLOCK(dax_host_lock); + +int dax_read_lock(void) +{ + return srcu_read_lock(&dax_srcu); +} +EXPORT_SYMBOL_GPL(dax_read_lock); + +void dax_read_unlock(int id) +{ + srcu_read_unlock(&dax_srcu, id); +} +EXPORT_SYMBOL_GPL(dax_read_unlock); + +/** + * struct dax_device - anchor object for dax services + * @inode: core vfs + * @cdev: optional character interface for "device dax" + * @host: optional name for lookups where the device path is not available + * @private: dax driver private data + * @alive: !alive + rcu grace period == no new operations / mappings + */ +struct dax_device { + struct hlist_node list; + struct inode inode; + struct cdev cdev; + const char *host; + void *private; + bool alive; + const struct dax_operations *ops; +}; + +/** + * dax_direct_access() - translate a device pgoff to an absolute pfn + * @dax_dev: a dax_device instance representing the logical memory range + * @pgoff: offset in pages from the start of the device to translate + * @nr_pages: number of consecutive pages caller can handle relative to @pfn + * @kaddr: output parameter that returns a virtual address mapping of pfn + * @pfn: output parameter that returns an absolute pfn translation of @pgoff + * + * Return: negative errno if an error occurs, otherwise the number of + * pages accessible at the device relative @pgoff. + */ +long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn) +{ + long avail; + + /* + * The device driver is allowed to sleep, in order to make the + * memory directly accessible. + */ + might_sleep(); + + if (!dax_dev) + return -EOPNOTSUPP; + + if (!dax_alive(dax_dev)) + return -ENXIO; + + if (nr_pages < 0) + return nr_pages; + + avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, + kaddr, pfn); + if (!avail) + return -ERANGE; + return min(avail, nr_pages); +} +EXPORT_SYMBOL_GPL(dax_direct_access); + +bool dax_alive(struct dax_device *dax_dev) +{ + lockdep_assert_held(&dax_srcu); + return dax_dev->alive; +} +EXPORT_SYMBOL_GPL(dax_alive); + +static int dax_host_hash(const char *host) +{ + return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; +} + +/* + * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring + * that any fault handlers or operations that might have seen + * dax_alive(), have completed. Any operations that start after + * synchronize_srcu() has run will abort upon seeing !dax_alive(). + */ +void kill_dax(struct dax_device *dax_dev) +{ + if (!dax_dev) + return; + + dax_dev->alive = false; + + synchronize_srcu(&dax_srcu); + + spin_lock(&dax_host_lock); + hlist_del_init(&dax_dev->list); + spin_unlock(&dax_host_lock); + + dax_dev->private = NULL; +} +EXPORT_SYMBOL_GPL(kill_dax); + +static struct inode *dax_alloc_inode(struct super_block *sb) +{ + struct dax_device *dax_dev; + + dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); + return &dax_dev->inode; +} + +static struct dax_device *to_dax_dev(struct inode *inode) +{ + return container_of(inode, struct dax_device, inode); +} + +static void dax_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct dax_device *dax_dev = to_dax_dev(inode); + + kfree(dax_dev->host); + dax_dev->host = NULL; + ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); + kmem_cache_free(dax_cache, dax_dev); +} + +static void dax_destroy_inode(struct inode *inode) +{ + struct dax_device *dax_dev = to_dax_dev(inode); + + WARN_ONCE(dax_dev->alive, + "kill_dax() must be called before final iput()\n"); + call_rcu(&inode->i_rcu, dax_i_callback); +} + +static const struct super_operations dax_sops = { + .statfs = simple_statfs, + .alloc_inode = dax_alloc_inode, + .destroy_inode = dax_destroy_inode, + .drop_inode = generic_delete_inode, +}; + +static struct dentry *dax_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); +} + +static struct file_system_type dax_fs_type = { + .name = "dax", + .mount = dax_mount, + .kill_sb = kill_anon_super, +}; + +static int dax_test(struct inode *inode, void *data) +{ + dev_t devt = *(dev_t *) data; + + return inode->i_rdev == devt; +} + +static int dax_set(struct inode *inode, void *data) +{ + dev_t devt = *(dev_t *) data; + + inode->i_rdev = devt; + return 0; +} + +static struct dax_device *dax_dev_get(dev_t devt) +{ + struct dax_device *dax_dev; + struct inode *inode; + + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), + dax_test, dax_set, &devt); + + if (!inode) + return NULL; + + dax_dev = to_dax_dev(inode); + if (inode->i_state & I_NEW) { + dax_dev->alive = true; + inode->i_cdev = &dax_dev->cdev; + inode->i_mode = S_IFCHR; + inode->i_flags = S_DAX; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + unlock_new_inode(inode); + } + + return dax_dev; +} + +static void dax_add_host(struct dax_device *dax_dev, const char *host) +{ + int hash; + + /* + * Unconditionally init dax_dev since it's coming from a + * non-zeroed slab cache + */ + INIT_HLIST_NODE(&dax_dev->list); + dax_dev->host = host; + if (!host) + return; + + hash = dax_host_hash(host); + spin_lock(&dax_host_lock); + hlist_add_head(&dax_dev->list, &dax_host_list[hash]); + spin_unlock(&dax_host_lock); +} + +struct dax_device *alloc_dax(void *private, const char *__host, + const struct dax_operations *ops) +{ + struct dax_device *dax_dev; + const char *host; + dev_t devt; + int minor; + + host = kstrdup(__host, GFP_KERNEL); + if (__host && !host) + return NULL; + + minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL); + if (minor < 0) + goto err_minor; + + devt = MKDEV(MAJOR(dax_devt), minor); + dax_dev = dax_dev_get(devt); + if (!dax_dev) + goto err_dev; + + dax_add_host(dax_dev, host); + dax_dev->ops = ops; + dax_dev->private = private; + return dax_dev; + + err_dev: + ida_simple_remove(&dax_minor_ida, minor); + err_minor: + kfree(host); + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_dax); + +void put_dax(struct dax_device *dax_dev) +{ + if (!dax_dev) + return; + iput(&dax_dev->inode); +} +EXPORT_SYMBOL_GPL(put_dax); + +/** + * dax_get_by_host() - temporary lookup mechanism for filesystem-dax + * @host: alternate name for the device registered by a dax driver + */ +struct dax_device *dax_get_by_host(const char *host) +{ + struct dax_device *dax_dev, *found = NULL; + int hash, id; + + if (!host) + return NULL; + + hash = dax_host_hash(host); + + id = dax_read_lock(); + spin_lock(&dax_host_lock); + hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { + if (!dax_alive(dax_dev) + || strcmp(host, dax_dev->host) != 0) + continue; + + if (igrab(&dax_dev->inode)) + found = dax_dev; + break; + } + spin_unlock(&dax_host_lock); + dax_read_unlock(id); + + return found; +} +EXPORT_SYMBOL_GPL(dax_get_by_host); + +/** + * inode_dax: convert a public inode into its dax_dev + * @inode: An inode with i_cdev pointing to a dax_dev + * + * Note this is not equivalent to to_dax_dev() which is for private + * internal use where we know the inode filesystem type == dax_fs_type. + */ +struct dax_device *inode_dax(struct inode *inode) +{ + struct cdev *cdev = inode->i_cdev; + + return container_of(cdev, struct dax_device, cdev); +} +EXPORT_SYMBOL_GPL(inode_dax); + +struct inode *dax_inode(struct dax_device *dax_dev) +{ + return &dax_dev->inode; +} +EXPORT_SYMBOL_GPL(dax_inode); + +void *dax_get_private(struct dax_device *dax_dev) +{ + return dax_dev->private; +} +EXPORT_SYMBOL_GPL(dax_get_private); + +static void init_once(void *_dax_dev) +{ + struct dax_device *dax_dev = _dax_dev; + struct inode *inode = &dax_dev->inode; + + inode_init_once(inode); +} + +static int __dax_fs_init(void) +{ + int rc; + + dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + if (!dax_cache) + return -ENOMEM; + + rc = register_filesystem(&dax_fs_type); + if (rc) + goto err_register_fs; + + dax_mnt = kern_mount(&dax_fs_type); + if (IS_ERR(dax_mnt)) { + rc = PTR_ERR(dax_mnt); + goto err_mount; + } + dax_superblock = dax_mnt->mnt_sb; + + return 0; + + err_mount: + unregister_filesystem(&dax_fs_type); + err_register_fs: + kmem_cache_destroy(dax_cache); + + return rc; +} + +static void __dax_fs_exit(void) +{ + kern_unmount(dax_mnt); + unregister_filesystem(&dax_fs_type); + kmem_cache_destroy(dax_cache); +} + +static int __init dax_fs_init(void) +{ + int rc; + + rc = __dax_fs_init(); + if (rc) + return rc; + + nr_dax = max(nr_dax, 256); + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + if (rc) + __dax_fs_exit(); + return rc; +} + +static void __exit dax_fs_exit(void) +{ + unregister_chrdev_region(dax_devt, nr_dax); + ida_destroy(&dax_minor_ida); + __dax_fs_exit(); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +subsys_initcall(dax_fs_init); +module_exit(dax_fs_exit); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 7468a22f9d10..349ff8813401 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -200,6 +200,7 @@ config BLK_DEV_DM_BUILTIN config BLK_DEV_DM tristate "Device mapper support" select BLK_DEV_DM_BUILTIN + select DAX ---help--- Device-mapper is a low level volume manager. It works by allowing people to specify mappings for ranges of logical sectors. Various diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 97db4d11c05a..52ca8d059e82 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -58,6 +58,7 @@ struct mapped_device { struct target_type *immutable_target_type; struct gendisk *disk; + struct dax_device *dax_dev; char name[16]; void *interface_ptr; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index a5120961632a..7d42a9d9f406 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -9,6 +9,7 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/dax.h> #include <linux/slab.h> #include <linux/device-mapper.h> @@ -142,22 +143,20 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -static long linear_direct_access(struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { + long ret; struct linear_c *lc = ti->private; struct block_device *bdev = lc->dev->bdev; - struct blk_dax_ctl dax = { - .sector = linear_map_sector(ti, sector), - .size = size, - }; - long ret; - - ret = bdev_direct_access(bdev, &dax); - *kaddr = dax.addr; - *pfn = dax.pfn; - - return ret; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + + dev_sector = linear_map_sector(ti, sector); + ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff); + if (ret) + return ret; + return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); } static struct target_type linear_target = { @@ -171,7 +170,7 @@ static struct target_type linear_target = { .status = linear_status, .prepare_ioctl = linear_prepare_ioctl, .iterate_devices = linear_iterate_devices, - .direct_access = linear_direct_access, + .direct_access = linear_dax_direct_access, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c65feeada864..e152d9817c81 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2302,8 +2302,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio) return do_origin(o->dev, bio); } -static long origin_direct_access(struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { DMWARN("device does not support dax."); return -EIO; @@ -2368,7 +2368,7 @@ static struct target_type origin_target = { .postsuspend = origin_postsuspend, .status = origin_status, .iterate_devices = origin_iterate_devices, - .direct_access = origin_direct_access, + .direct_access = origin_dax_direct_access, }; static struct target_type snapshot_target = { diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 4b50ae115c6d..75152482f3ad 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/dax.h> #include <linux/slab.h> #include <linux/log2.h> @@ -310,27 +311,25 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } -static long stripe_direct_access(struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; struct stripe_c *sc = ti->private; - uint32_t stripe; + struct dax_device *dax_dev; struct block_device *bdev; - struct blk_dax_ctl dax = { - .size = size, - }; + uint32_t stripe; long ret; - stripe_map_sector(sc, sector, &stripe, &dax.sector); - - dax.sector += sc->stripe[stripe].physical_start; + stripe_map_sector(sc, sector, &stripe, &dev_sector); + dev_sector += sc->stripe[stripe].physical_start; + dax_dev = sc->stripe[stripe].dev->dax_dev; bdev = sc->stripe[stripe].dev->bdev; - ret = bdev_direct_access(bdev, &dax); - *kaddr = dax.addr; - *pfn = dax.pfn; - - return ret; + ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff); + if (ret) + return ret; + return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); } /* @@ -451,7 +450,7 @@ static struct target_type stripe_target = { .status = stripe_status, .iterate_devices = stripe_iterate_devices, .io_hints = stripe_io_hints, - .direct_access = stripe_direct_access, + .direct_access = stripe_dax_direct_access, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 6264ff00dcf0..b242b750542f 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -142,8 +142,8 @@ static void io_err_release_clone_rq(struct request *clone) { } -static long io_err_direct_access(struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { return -EIO; } @@ -157,7 +157,7 @@ static struct target_type error_target = { .map = io_err_map, .clone_and_map_rq = io_err_clone_and_map_rq, .release_clone_rq = io_err_release_clone_rq, - .direct_access = io_err_direct_access, + .direct_access = io_err_dax_direct_access, }; int __init dm_target_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 268edf402bbb..6ef9500226c0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -16,6 +16,7 @@ #include <linux/blkpg.h> #include <linux/bio.h> #include <linux/mempool.h> +#include <linux/dax.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/hdreg.h> @@ -629,6 +630,7 @@ static int open_table_device(struct table_device *td, dev_t dev, } td->dm_dev.bdev = bdev; + td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); return 0; } @@ -642,7 +644,9 @@ static void close_table_device(struct table_device *td, struct mapped_device *md bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + put_dax(td->dm_dev.dax_dev); td->dm_dev.bdev = NULL; + td->dm_dev.dax_dev = NULL; } static struct table_device *find_table_device(struct list_head *l, dev_t dev, @@ -920,31 +924,49 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); -static long dm_blk_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, + sector_t sector, int *srcu_idx) { - struct mapped_device *md = bdev->bd_disk->private_data; struct dm_table *map; struct dm_target *ti; - int srcu_idx; - long len, ret = -EIO; - map = dm_get_live_table(md, &srcu_idx); + map = dm_get_live_table(md, srcu_idx); if (!map) - goto out; + return NULL; ti = dm_table_find_target(map, sector); if (!dm_target_is_valid(ti)) - goto out; + return NULL; + + return ti; +} + +static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + long len, ret = -EIO; + int srcu_idx; - len = max_io_len(sector, ti) << SECTOR_SHIFT; - size = min(len, size); + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + if (!ti) + goto out; + if (!ti->type->direct_access) + goto out; + len = max_io_len(sector, ti) / PAGE_SECTORS; + if (len < 1) + goto out; + nr_pages = min(len, nr_pages); if (ti->type->direct_access) - ret = ti->type->direct_access(ti, sector, kaddr, pfn, size); -out: + ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); + + out: dm_put_live_table(md, srcu_idx); - return min(ret, size); + + return ret; } /* @@ -1471,6 +1493,7 @@ static int next_free_minor(int *minor) } static const struct block_device_operations dm_blk_dops; +static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); @@ -1517,6 +1540,12 @@ static void cleanup_mapped_device(struct mapped_device *md) if (md->bs) bioset_free(md->bs); + if (md->dax_dev) { + kill_dax(md->dax_dev); + put_dax(md->dax_dev); + md->dax_dev = NULL; + } + if (md->disk) { spin_lock(&_minor_lock); md->disk->private_data = NULL; @@ -1544,6 +1573,7 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); + struct dax_device *dax_dev; struct mapped_device *md; void *old_md; @@ -1608,6 +1638,12 @@ static struct mapped_device *alloc_dev(int minor) md->disk->queue = md->queue; md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); + + dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + if (!dax_dev) + goto bad; + md->dax_dev = dax_dev; + add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); @@ -2816,12 +2852,15 @@ static const struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, - .direct_access = dm_blk_direct_access, .getgeo = dm_blk_getgeo, .pr_ops = &dm_pr_ops, .owner = THIS_MODULE }; +static const struct dax_operations dm_dax_ops = { + .direct_access = dm_dax_direct_access, +}; + /* * module hooks */ diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 59e750183b7f..5bdd499b5f4f 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -20,6 +20,7 @@ if LIBNVDIMM config BLK_DEV_PMEM tristate "PMEM: Persistent memory block device support" default LIBNVDIMM + select DAX select ND_BTT if BTT select ND_PFN if NVDIMM_PFN help diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 97dd2925ed6e..4b76af2b8715 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -314,7 +314,7 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns) if (rc < 0) { struct nd_btt *nd_btt = to_nd_btt(btt_dev); - __nd_detach_ndns(btt_dev, &nd_btt->ndns); + nd_detach_ndns(btt_dev, &nd_btt->ndns); put_device(btt_dev); } diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 351bac8f6503..e9361bffe5ee 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -27,6 +27,7 @@ #include <linux/nd.h> #include "nd-core.h" #include "nd.h" +#include "pfn.h" int nvdimm_major; static int nvdimm_bus_major; @@ -171,6 +172,57 @@ void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event) } EXPORT_SYMBOL_GPL(nvdimm_region_notify); +struct clear_badblocks_context { + resource_size_t phys, cleared; +}; + +static int nvdimm_clear_badblocks_region(struct device *dev, void *data) +{ + struct clear_badblocks_context *ctx = data; + struct nd_region *nd_region; + resource_size_t ndr_end; + sector_t sector; + + /* make sure device is a region */ + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1; + + /* make sure we are in the region */ + if (ctx->phys < nd_region->ndr_start + || (ctx->phys + ctx->cleared) > ndr_end) + return 0; + + sector = (ctx->phys - nd_region->ndr_start) / 512; + badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512); + + return 0; +} + +static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus, + phys_addr_t phys, u64 cleared) +{ + struct clear_badblocks_context ctx = { + .phys = phys, + .cleared = cleared, + }; + + device_for_each_child(&nvdimm_bus->dev, &ctx, + nvdimm_clear_badblocks_region); +} + +static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus, + phys_addr_t phys, u64 cleared) +{ + if (cleared > 0) + nvdimm_forget_poison(nvdimm_bus, phys, cleared); + + if (cleared > 0 && cleared / 512) + nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); +} + long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, unsigned int len) { @@ -218,7 +270,8 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, if (cmd_rc < 0) return cmd_rc; - nvdimm_clear_from_poison_list(nvdimm_bus, phys, len); + nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared); + return clear_err.cleared; } EXPORT_SYMBOL_GPL(nvdimm_clear_poison); @@ -286,6 +339,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); + spin_lock_init(&nvdimm_bus->poison_lock); if (nvdimm_bus->id < 0) { kfree(nvdimm_bus); return NULL; @@ -354,9 +408,9 @@ static int nd_bus_remove(struct device *dev) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); - nvdimm_bus_lock(&nvdimm_bus->dev); + spin_lock(&nvdimm_bus->poison_lock); free_poison_list(&nvdimm_bus->poison_list); - nvdimm_bus_unlock(&nvdimm_bus->dev); + spin_unlock(&nvdimm_bus->poison_lock); nvdimm_bus_destroy_ndctl(nvdimm_bus); @@ -769,16 +823,55 @@ void wait_nvdimm_bus_probe_idle(struct device *dev) } while (true); } -static int pmem_active(struct device *dev, void *data) +static int nd_pmem_forget_poison_check(struct device *dev, void *data) { - if (is_nd_pmem(dev) && dev->driver) + struct nd_cmd_clear_error *clear_err = + (struct nd_cmd_clear_error *)data; + struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL; + struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL; + struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL; + struct nd_namespace_common *ndns = NULL; + struct nd_namespace_io *nsio; + resource_size_t offset = 0, end_trunc = 0, start, end, pstart, pend; + + if (nd_dax || !dev->driver) + return 0; + + start = clear_err->address; + end = clear_err->address + clear_err->cleared - 1; + + if (nd_btt || nd_pfn || nd_dax) { + if (nd_btt) + ndns = nd_btt->ndns; + else if (nd_pfn) + ndns = nd_pfn->ndns; + else if (nd_dax) + ndns = nd_dax->nd_pfn.ndns; + + if (!ndns) + return 0; + } else + ndns = to_ndns(dev); + + nsio = to_nd_namespace_io(&ndns->dev); + pstart = nsio->res.start + offset; + pend = nsio->res.end - end_trunc; + + if ((pstart >= start) && (pend <= end)) return -EBUSY; + return 0; + +} + +static int nd_ns_forget_poison_check(struct device *dev, void *data) +{ + return device_for_each_child(dev, data, nd_pmem_forget_poison_check); } /* set_config requires an idle interleave set */ static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus, - struct nvdimm *nvdimm, unsigned int cmd) + struct nvdimm *nvdimm, unsigned int cmd, void *data) { struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; @@ -792,8 +885,8 @@ static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus, /* require clear error to go through the pmem driver */ if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR) - return device_for_each_child(&nvdimm_bus->dev, NULL, - pmem_active); + return device_for_each_child(&nvdimm_bus->dev, data, + nd_ns_forget_poison_check); if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) return 0; @@ -820,7 +913,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, const char *cmd_name, *dimm_name; unsigned long cmd_mask; void *buf; - int rc, i; + int rc, i, cmd_rc; if (nvdimm) { desc = nd_cmd_dimm_desc(cmd); @@ -927,13 +1020,20 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, } nvdimm_bus_lock(&nvdimm_bus->dev); - rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd); + rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd, buf); if (rc) goto out_unlock; - rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL); + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc); if (rc < 0) goto out_unlock; + + if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) { + struct nd_cmd_clear_error *clear_err = buf; + + nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address, + clear_err->cleared); + } nvdimm_bus_unlock(&nvdimm_bus->dev); if (copy_to_user(p, buf, buf_len)) diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index ca6d572c48fc..93d128da1c92 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -21,8 +21,13 @@ void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns) { struct nd_namespace_common *ndns = *_ndns; + struct nvdimm_bus *nvdimm_bus; - lockdep_assert_held(&ndns->dev.mutex); + if (!ndns) + return; + + nvdimm_bus = walk_to_nvdimm_bus(&ndns->dev); + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); dev_WARN_ONCE(dev, ndns->claim != dev, "%s: invalid claim\n", __func__); ndns->claim = NULL; *_ndns = NULL; @@ -37,18 +42,20 @@ void nd_detach_ndns(struct device *dev, if (!ndns) return; get_device(&ndns->dev); - device_lock(&ndns->dev); + nvdimm_bus_lock(&ndns->dev); __nd_detach_ndns(dev, _ndns); - device_unlock(&ndns->dev); + nvdimm_bus_unlock(&ndns->dev); put_device(&ndns->dev); } bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, struct nd_namespace_common **_ndns) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&attach->dev); + if (attach->claim) return false; - lockdep_assert_held(&attach->dev.mutex); + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); dev_WARN_ONCE(dev, *_ndns, "%s: invalid claim\n", __func__); attach->claim = dev; *_ndns = attach; @@ -61,9 +68,9 @@ bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, { bool claimed; - device_lock(&attach->dev); + nvdimm_bus_lock(&attach->dev); claimed = __nd_attach_ndns(dev, attach, _ndns); - device_unlock(&attach->dev); + nvdimm_bus_unlock(&attach->dev); return claimed; } @@ -114,7 +121,7 @@ static void nd_detach_and_reset(struct device *dev, struct nd_namespace_common **_ndns) { /* detach the namespace and destroy / reset the device */ - nd_detach_ndns(dev, _ndns); + __nd_detach_ndns(dev, _ndns); if (is_idle(dev, *_ndns)) { nd_device_unregister(dev, ND_ASYNC); } else if (is_nd_btt(dev)) { @@ -184,7 +191,7 @@ ssize_t nd_namespace_store(struct device *dev, } WARN_ON_ONCE(!is_nvdimm_bus_locked(dev)); - if (!nd_attach_ndns(dev, ndns, _ndns)) { + if (!__nd_attach_ndns(dev, ndns, _ndns)) { dev_dbg(dev, "%s already claimed\n", dev_name(&ndns->dev)); len = -EBUSY; @@ -239,22 +246,24 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, if (rw == READ) { if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) return -EIO; - return memcpy_from_pmem(buf, nsio->addr + offset, size); + return memcpy_mcsafe(buf, nsio->addr + offset, size); } if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { /* * FIXME: nsio_rw_bytes() may be called from atomic - * context in the btt case and nvdimm_clear_poison() - * takes a sleeping lock. Until the locking can be - * reworked this capability requires that the namespace - * is not claimed by btt. + * context in the btt case and the ACPI DSM path for + * clearing the error takes sleeping locks and allocates + * memory. An explicit error clearing path, and support + * for tracking badblocks in BTT metadata is needed to + * work around this collision. */ if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) && (!ndns->claim || !is_nd_btt(ndns->claim))) { long cleared; - cleared = nvdimm_clear_poison(&ndns->dev, offset, size); + cleared = nvdimm_clear_poison(&ndns->dev, + nsio->res.start + offset, size); if (cleared < size) rc = -EIO; if (cleared > 0 && cleared / 512) { diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 9303cfeb8bee..2dee908e4bae 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -518,6 +518,15 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, } EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); +static void append_poison_entry(struct nvdimm_bus *nvdimm_bus, + struct nd_poison *pl, u64 addr, u64 length) +{ + lockdep_assert_held(&nvdimm_bus->poison_lock); + pl->start = addr; + pl->length = length; + list_add_tail(&pl->list, &nvdimm_bus->poison_list); +} + static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, gfp_t flags) { @@ -527,19 +536,24 @@ static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, if (!pl) return -ENOMEM; - pl->start = addr; - pl->length = length; - list_add_tail(&pl->list, &nvdimm_bus->poison_list); - + append_poison_entry(nvdimm_bus, pl, addr, length); return 0; } static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) { - struct nd_poison *pl; + struct nd_poison *pl, *pl_new; - if (list_empty(&nvdimm_bus->poison_list)) - return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); + spin_unlock(&nvdimm_bus->poison_lock); + pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL); + spin_lock(&nvdimm_bus->poison_lock); + + if (list_empty(&nvdimm_bus->poison_list)) { + if (!pl_new) + return -ENOMEM; + append_poison_entry(nvdimm_bus, pl_new, addr, length); + return 0; + } /* * There is a chance this is a duplicate, check for those first. @@ -551,6 +565,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) /* If length has changed, update this list entry */ if (pl->length != length) pl->length = length; + kfree(pl_new); return 0; } @@ -559,29 +574,33 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) * as any overlapping ranges will get resolved when the list is consumed * and converted to badblocks */ - return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); + if (!pl_new) + return -ENOMEM; + append_poison_entry(nvdimm_bus, pl_new, addr, length); + + return 0; } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) { int rc; - nvdimm_bus_lock(&nvdimm_bus->dev); + spin_lock(&nvdimm_bus->poison_lock); rc = bus_add_poison(nvdimm_bus, addr, length); - nvdimm_bus_unlock(&nvdimm_bus->dev); + spin_unlock(&nvdimm_bus->poison_lock); return rc; } EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); -void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, - phys_addr_t start, unsigned int len) +void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, + unsigned int len) { struct list_head *poison_list = &nvdimm_bus->poison_list; u64 clr_end = start + len - 1; struct nd_poison *pl, *next; - nvdimm_bus_lock(&nvdimm_bus->dev); + spin_lock(&nvdimm_bus->poison_lock); WARN_ON_ONCE(list_empty(poison_list)); /* @@ -628,15 +647,15 @@ void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, u64 new_len = pl_end - new_start + 1; /* Add new entry covering the right half */ - add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO); + add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT); /* Adjust this entry to cover the left half */ pl->length = start - pl->start; continue; } } - nvdimm_bus_unlock(&nvdimm_bus->dev); + spin_unlock(&nvdimm_bus->poison_lock); } -EXPORT_SYMBOL_GPL(nvdimm_clear_from_poison_list); +EXPORT_SYMBOL_GPL(nvdimm_forget_poison); #ifdef CONFIG_BLK_DEV_INTEGRITY int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c index 45fa82cae87c..c1b6556aea6e 100644 --- a/drivers/nvdimm/dax_devs.c +++ b/drivers/nvdimm/dax_devs.c @@ -124,7 +124,7 @@ int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns) dev_dbg(dev, "%s: dax: %s\n", __func__, rc == 0 ? dev_name(dax_dev) : "<none>"); if (rc < 0) { - __nd_detach_ndns(dax_dev, &nd_pfn->ndns); + nd_detach_ndns(dax_dev, &nd_pfn->ndns); put_device(dax_dev); } else __nd_device_register(dax_dev); diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index ee0b412827bf..e0f0e3ce1a32 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -49,6 +49,8 @@ static int nvdimm_probe(struct device *dev) kref_init(&ndd->kref); rc = nvdimm_init_nsarea(ndd); + if (rc == -EACCES) + nvdimm_set_locked(dev); if (rc) goto err; diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 8b721321be5b..fac1e9fbd11d 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -34,7 +34,7 @@ int nvdimm_check_config_data(struct device *dev) if (!nvdimm->cmd_mask || !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { - if (nvdimm->flags & NDD_ALIASING) + if (test_bit(NDD_ALIASING, &nvdimm->flags)) return -ENXIO; else return -ENOTTY; @@ -67,6 +67,7 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd) struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); struct nvdimm_bus_descriptor *nd_desc; int rc = validate_dimm(ndd); + int cmd_rc = 0; if (rc) return rc; @@ -76,8 +77,11 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd) memset(cmd, 0, sizeof(*cmd)); nd_desc = nvdimm_bus->nd_desc; - return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), - ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), NULL); + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), &cmd_rc); + if (rc < 0) + return rc; + return cmd_rc; } int nvdimm_init_config_data(struct nvdimm_drvdata *ndd) @@ -188,7 +192,14 @@ void nvdimm_set_aliasing(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); - nvdimm->flags |= NDD_ALIASING; + set_bit(NDD_ALIASING, &nvdimm->flags); +} + +void nvdimm_set_locked(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + set_bit(NDD_LOCKED, &nvdimm->flags); } static void nvdimm_release(struct device *dev) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 1b481a5fb966..2f9dfbd2dbec 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -2236,14 +2236,21 @@ static int init_active_labels(struct nd_region *nd_region) int count, j; /* - * If the dimm is disabled then prevent the region from - * being activated if it aliases DPA. + * If the dimm is disabled then we may need to prevent + * the region from being activated. */ if (!ndd) { - if ((nvdimm->flags & NDD_ALIASING) == 0) + if (test_bit(NDD_LOCKED, &nvdimm->flags)) + /* fail, label data may be unreadable */; + else if (test_bit(NDD_ALIASING, &nvdimm->flags)) + /* fail, labels needed to disambiguate dpa */; + else return 0; - dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n", - dev_name(&nd_mapping->nvdimm->dev)); + + dev_err(&nd_region->dev, "%s: is %s, failing probe\n", + dev_name(&nd_mapping->nvdimm->dev), + test_bit(NDD_LOCKED, &nvdimm->flags) + ? "locked" : "disabled"); return -ENXIO; } nd_mapping->ndd = ndd; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 8623e57c2ce3..4c4bd209e725 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -32,6 +32,7 @@ struct nvdimm_bus { struct list_head poison_list; struct list_head mapping_list; struct mutex reconfig_mutex; + spinlock_t poison_lock; }; struct nvdimm { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 2a99c83aa19f..77d032192bf7 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -154,6 +154,7 @@ struct nd_region { u64 ndr_start; int id, num_lanes, ro, numa_node; void *provider_data; + struct badblocks bb; struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; struct nd_mapping mapping[0]; @@ -239,6 +240,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, unsigned int len); void nvdimm_set_aliasing(struct device *dev); +void nvdimm_set_locked(struct device *dev); struct nd_btt *to_nd_btt(struct device *dev); struct nd_gen_sb { diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 6c033c9a2f06..335c8175410b 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -484,7 +484,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns) dev_dbg(dev, "%s: pfn: %s\n", __func__, rc == 0 ? dev_name(pfn_dev) : "<none>"); if (rc < 0) { - __nd_detach_ndns(pfn_dev, &nd_pfn->ndns); + nd_detach_ndns(pfn_dev, &nd_pfn->ndns); put_device(pfn_dev); } else __nd_device_register(pfn_dev); @@ -538,7 +538,8 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn, nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); altmap = NULL; } else if (nd_pfn->mode == PFN_MODE_PMEM) { - nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE; + nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res) + - offset) / PAGE_SIZE); if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) dev_info(&nd_pfn->dev, "number of pfns truncated from %lld to %ld\n", @@ -625,7 +626,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) */ start += start_pad; size = resource_size(&nsio->res); - npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K; + npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - SZ_8K) + / PAGE_SIZE); if (nd_pfn->mode == PFN_MODE_PMEM) { /* * vmemmap_populate_hugepages() allocates the memmap array in diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index fbc640bf06b0..c544d466ea51 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -29,6 +29,7 @@ #include <linux/pfn_t.h> #include <linux/slab.h> #include <linux/pmem.h> +#include <linux/dax.h> #include <linux/nd.h> #include "pmem.h" #include "pfn.h" @@ -89,7 +90,7 @@ static int read_pmem(struct page *page, unsigned int off, int rc; void *mem = kmap_atomic(page); - rc = memcpy_from_pmem(mem + off, pmem_addr, len); + rc = memcpy_mcsafe(mem + off, pmem_addr, len); kunmap_atomic(mem); if (rc) return -EIO; @@ -200,13 +201,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ -__weak long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { - struct pmem_device *pmem = bdev->bd_queue->queuedata; - resource_size_t offset = sector * 512 + pmem->data_offset; + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) + if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)))) return -EIO; *kaddr = pmem->virt_addr + offset; *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); @@ -216,17 +217,28 @@ __weak long pmem_direct_access(struct block_device *bdev, sector_t sector, * requested range. */ if (unlikely(pmem->bb.count)) - return size; - return pmem->size - pmem->pfn_pad - offset; + return nr_pages; + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); } static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .rw_page = pmem_rw_page, - .direct_access = pmem_direct_access, .revalidate_disk = nvdimm_revalidate_disk, }; +static long pmem_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); +} + +static const struct dax_operations pmem_dax_ops = { + .direct_access = pmem_dax_direct_access, +}; + static void pmem_release_queue(void *q) { blk_cleanup_queue(q); @@ -237,10 +249,14 @@ static void pmem_freeze_queue(void *q) blk_freeze_queue_start(q); } -static void pmem_release_disk(void *disk) +static void pmem_release_disk(void *__pmem) { - del_gendisk(disk); - put_disk(disk); + struct pmem_device *pmem = __pmem; + + kill_dax(pmem->dax_dev); + put_dax(pmem->dax_dev); + del_gendisk(pmem->disk); + put_disk(pmem->disk); } static int pmem_attach_disk(struct device *dev, @@ -251,6 +267,7 @@ static int pmem_attach_disk(struct device *dev, struct vmem_altmap __altmap, *altmap = NULL; struct resource *res = &nsio->res; struct nd_pfn *nd_pfn = NULL; + struct dax_device *dax_dev; int nid = dev_to_node(dev); struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; @@ -334,6 +351,7 @@ static int pmem_attach_disk(struct device *dev, disk = alloc_disk_node(0, nid); if (!disk) return -ENOMEM; + pmem->disk = disk; disk->fops = &pmem_fops; disk->queue = q; @@ -345,9 +363,16 @@ static int pmem_attach_disk(struct device *dev, return -ENOMEM; nvdimm_badblocks_populate(nd_region, &pmem->bb, res); disk->bb = &pmem->bb; - device_add_disk(dev, disk); - if (devm_add_action_or_reset(dev, pmem_release_disk, disk)) + dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops); + if (!dax_dev) { + put_disk(disk); + return -ENOMEM; + } + pmem->dax_dev = dax_dev; + + device_add_disk(dev, disk); + if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) return -ENOMEM; revalidate_disk(disk); @@ -397,12 +422,12 @@ static void nd_pmem_shutdown(struct device *dev) static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) { - struct pmem_device *pmem = dev_get_drvdata(dev); - struct nd_region *nd_region = to_region(pmem); + struct nd_region *nd_region; resource_size_t offset = 0, end_trunc = 0; struct nd_namespace_common *ndns; struct nd_namespace_io *nsio; struct resource res; + struct badblocks *bb; if (event != NVDIMM_REVALIDATE_POISON) return; @@ -411,20 +436,33 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) struct nd_btt *nd_btt = to_nd_btt(dev); ndns = nd_btt->ndns; - } else if (is_nd_pfn(dev)) { - struct nd_pfn *nd_pfn = to_nd_pfn(dev); - struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + nd_region = to_nd_region(ndns->dev.parent); + nsio = to_nd_namespace_io(&ndns->dev); + bb = &nsio->bb; + } else { + struct pmem_device *pmem = dev_get_drvdata(dev); - ndns = nd_pfn->ndns; - offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad); - end_trunc = __le32_to_cpu(pfn_sb->end_trunc); - } else - ndns = to_ndns(dev); + nd_region = to_region(pmem); + bb = &pmem->bb; + + if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + + ndns = nd_pfn->ndns; + offset = pmem->data_offset + + __le32_to_cpu(pfn_sb->start_pad); + end_trunc = __le32_to_cpu(pfn_sb->end_trunc); + } else { + ndns = to_ndns(dev); + } + + nsio = to_nd_namespace_io(&ndns->dev); + } - nsio = to_nd_namespace_io(&ndns->dev); res.start = nsio->res.start + offset; res.end = nsio->res.end - end_trunc; - nvdimm_badblocks_populate(nd_region, &pmem->bb, &res); + nvdimm_badblocks_populate(nd_region, bb, &res); } MODULE_ALIAS("pmem"); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index b4ee4f71b4a1..7f4dbd72a90a 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -5,8 +5,6 @@ #include <linux/pfn_t.h> #include <linux/fs.h> -long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size); /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { /* One contiguous memory region per device */ @@ -20,5 +18,10 @@ struct pmem_device { /* trim size when namespace capacity has been section aligned */ u32 pfn_pad; struct badblocks bb; + struct dax_device *dax_dev; + struct gendisk *disk; }; + +long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn); #endif /* __NVDIMM_PMEM_H__ */ diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 8f241772ec0b..869a886c292e 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/nd.h> +#include "nd-core.h" #include "nd.h" static int nd_region_probe(struct device *dev) @@ -52,6 +53,17 @@ static int nd_region_probe(struct device *dev) if (rc && err && rc == err) return -ENODEV; + if (is_nd_pmem(&nd_region->dev)) { + struct resource ndr_res; + + if (devm_init_badblocks(dev, &nd_region->bb)) + return -ENODEV; + ndr_res.start = nd_region->ndr_start; + ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1; + nvdimm_badblocks_populate(nd_region, + &nd_region->bb, &ndr_res); + } + nd_region->btt_seed = nd_btt_create(nd_region); nd_region->pfn_seed = nd_pfn_create(nd_region); nd_region->dax_seed = nd_dax_create(nd_region); @@ -104,6 +116,18 @@ static int child_notify(struct device *dev, void *data) static void nd_region_notify(struct device *dev, enum nvdimm_event event) { + if (event == NVDIMM_REVALIDATE_POISON) { + struct nd_region *nd_region = to_nd_region(dev); + struct resource res; + + if (is_nd_pmem(&nd_region->dev)) { + res.start = nd_region->ndr_start; + res.end = nd_region->ndr_start + + nd_region->ndr_size - 1; + nvdimm_badblocks_populate(nd_region, + &nd_region->bb, &res); + } + } device_for_each_child(dev, &event, child_notify); } diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index b7cb5066d961..b550edf2571f 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -222,7 +222,7 @@ int nd_region_to_nstype(struct nd_region *nd_region) struct nd_mapping *nd_mapping = &nd_region->mapping[i]; struct nvdimm *nvdimm = nd_mapping->nvdimm; - if (nvdimm->flags & NDD_ALIASING) + if (test_bit(NDD_ALIASING, &nvdimm->flags)) alias++; } if (alias) @@ -255,6 +255,35 @@ static ssize_t size_show(struct device *dev, } static DEVICE_ATTR_RO(size); +static ssize_t deep_flush_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + /* + * NOTE: in the nvdimm_has_flush() error case this attribute is + * not visible. + */ + return sprintf(buf, "%d\n", nvdimm_has_flush(nd_region)); +} + +static ssize_t deep_flush_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + bool flush; + int rc = strtobool(buf, &flush); + struct nd_region *nd_region = to_nd_region(dev); + + if (rc) + return rc; + if (!flush) + return -EINVAL; + nvdimm_flush(nd_region); + + return len; +} +static DEVICE_ATTR_RW(deep_flush); + static ssize_t mappings_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -448,6 +477,25 @@ static ssize_t read_only_store(struct device *dev, } static DEVICE_ATTR_RW(read_only); +static ssize_t region_badblocks_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return badblocks_show(&nd_region->bb, buf, 0); +} + +static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%#llx\n", nd_region->ndr_start); +} +static DEVICE_ATTR_RO(resource); + static struct attribute *nd_region_attributes[] = { &dev_attr_size.attr, &dev_attr_nstype.attr, @@ -455,11 +503,14 @@ static struct attribute *nd_region_attributes[] = { &dev_attr_btt_seed.attr, &dev_attr_pfn_seed.attr, &dev_attr_dax_seed.attr, + &dev_attr_deep_flush.attr, &dev_attr_read_only.attr, &dev_attr_set_cookie.attr, &dev_attr_available_size.attr, &dev_attr_namespace_seed.attr, &dev_attr_init_namespaces.attr, + &dev_attr_badblocks.attr, + &dev_attr_resource.attr, NULL, }; @@ -476,6 +527,23 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr) return 0; + if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) + return 0; + + if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr) + return 0; + + if (a == &dev_attr_deep_flush.attr) { + int has_flush = nvdimm_has_flush(nd_region); + + if (has_flush == 1) + return a->mode; + else if (has_flush == 0) + return 0444; + else + return 0; + } + if (a != &dev_attr_set_cookie.attr && a != &dev_attr_available_size.attr) return a->mode; @@ -813,7 +881,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, return NULL; } - if (nvdimm->flags & NDD_UNARMED) + if (test_bit(NDD_UNARMED, &nvdimm->flags)) ro = 1; } @@ -968,17 +1036,20 @@ EXPORT_SYMBOL_GPL(nvdimm_flush); */ int nvdimm_has_flush(struct nd_region *nd_region) { - struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev); int i; /* no nvdimm == flushing capability unknown */ if (nd_region->ndr_mappings == 0) return -ENXIO; - for (i = 0; i < nd_region->ndr_mappings; i++) - /* flush hints present, flushing required */ - if (ndrd_get_flush_wpq(ndrd, i, 0)) + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + /* flush hints present / available */ + if (nvdimm->num_flush) return 1; + } /* * The platform defines dimm devices without hints, assume diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig index 4a3b62326183..0acb8c2f9475 100644 --- a/drivers/s390/block/Kconfig +++ b/drivers/s390/block/Kconfig @@ -14,6 +14,7 @@ config BLK_DEV_XPRAM config DCSSBLK def_tristate m + select DAX prompt "DCSSBLK support" depends on S390 && BLOCK help diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 415d10a67b7a..36e5280af3e4 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/pfn_t.h> +#include <linux/dax.h> #include <asm/extmem.h> #include <asm/io.h> @@ -30,8 +31,8 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static blk_qc_t dcssblk_make_request(struct request_queue *q, struct bio *bio); -static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, - void **kaddr, pfn_t *pfn, long size); +static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn); static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; @@ -40,7 +41,10 @@ static const struct block_device_operations dcssblk_devops = { .owner = THIS_MODULE, .open = dcssblk_open, .release = dcssblk_release, - .direct_access = dcssblk_direct_access, +}; + +static const struct dax_operations dcssblk_dax_ops = { + .direct_access = dcssblk_dax_direct_access, }; struct dcssblk_dev_info { @@ -57,6 +61,7 @@ struct dcssblk_dev_info { struct request_queue *dcssblk_queue; int num_of_segments; struct list_head seg_list; + struct dax_device *dax_dev; }; struct segment_info { @@ -389,6 +394,8 @@ removeseg: } list_del(&dev_info->lh); + kill_dax(dev_info->dax_dev); + put_dax(dev_info->dax_dev); del_gendisk(dev_info->gd); blk_cleanup_queue(dev_info->dcssblk_queue); dev_info->gd->queue = NULL; @@ -654,6 +661,13 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char if (rc) goto put_dev; + dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name, + &dcssblk_dax_ops); + if (!dev_info->dax_dev) { + rc = -ENOMEM; + goto put_dev; + } + get_device(&dev_info->dev); device_add_disk(&dev_info->dev, dev_info->gd); @@ -752,6 +766,8 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch } list_del(&dev_info->lh); + kill_dax(dev_info->dax_dev); + put_dax(dev_info->dax_dev); del_gendisk(dev_info->gd); blk_cleanup_queue(dev_info->dcssblk_queue); dev_info->gd->queue = NULL; @@ -883,21 +899,26 @@ fail: } static long -dcssblk_direct_access (struct block_device *bdev, sector_t secnum, - void **kaddr, pfn_t *pfn, long size) +__dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { - struct dcssblk_dev_info *dev_info; - unsigned long offset, dev_sz; + resource_size_t offset = pgoff * PAGE_SIZE; + unsigned long dev_sz; - dev_info = bdev->bd_disk->private_data; - if (!dev_info) - return -ENODEV; dev_sz = dev_info->end - dev_info->start + 1; - offset = secnum * 512; *kaddr = (void *) dev_info->start + offset; *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV); - return dev_sz - offset; + return (dev_sz - offset) / PAGE_SIZE; +} + +static long +dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev); + + return __dcssblk_direct_access(dev_info, pgoff, nr_pages, kaddr, pfn); } static void |