summaryrefslogtreecommitdiff
path: root/fs/dax.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dax.c')
-rw-r--r--fs/dax.c369
1 files changed, 280 insertions, 89 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 7fd4cd9a51f2..af5045b0f476 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *entry)
return xa_to_value(entry) >> DAX_SHIFT;
}
+static struct folio *dax_to_folio(void *entry)
+{
+ return page_folio(pfn_to_page(dax_to_pfn(entry)));
+}
+
static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
@@ -206,7 +211,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry,
*
* Must be called with the i_pages lock held.
*/
-static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
+static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
{
void *entry;
struct wait_exceptional_entry_queue ewait;
@@ -236,6 +241,37 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
}
/*
+ * Wait for the given entry to become unlocked. Caller must hold the i_pages
+ * lock and call either put_unlocked_entry() if it did not lock the entry or
+ * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
+ */
+static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
+{
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq;
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+
+ while (unlikely(dax_is_locked(entry))) {
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+ prepare_to_wait_exclusive(wq, &ewait.wait,
+ TASK_UNINTERRUPTIBLE);
+ xas_pause(xas);
+ xas_unlock_irq(xas);
+ schedule();
+ finish_wait(wq, &ewait.wait);
+ xas_lock_irq(xas);
+ entry = xas_load(xas);
+ }
+
+ if (xa_is_internal(entry))
+ return NULL;
+
+ return entry;
+}
+
+/*
* The only thing keeping the address space around is the i_pages lock
* (it's cycled in clear_inode() after removing the entries from i_pages)
* After we call xas_unlock_irq(), we cannot touch xas->xa.
@@ -250,7 +286,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
wq = dax_entry_waitqueue(xas, entry, &ewait.key);
/*
- * Unlike get_unlocked_entry() there is no guarantee that this
+ * Unlike get_next_unlocked_entry() there is no guarantee that this
* path ever successfully retrieves an unlocked entry before an
* inode dies. Perform a non-exclusive wait in case this path
* never successfully performs its own wake up.
@@ -307,109 +343,156 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}
-static unsigned long dax_end_pfn(void *entry)
+/*
+ * A DAX folio is considered shared if it has no mapping set and ->share (which
+ * shares the ->index field) is non-zero. Note this may return false even if the
+ * page is shared between multiple files but has not yet actually been mapped
+ * into multiple address spaces.
+ */
+static inline bool dax_folio_is_shared(struct folio *folio)
{
- return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+ return !folio->mapping && folio->share;
}
/*
- * Iterate through all mapped pfns represented by an entry, i.e. skip
- * 'empty' and 'zero' entries.
+ * When it is called by dax_insert_entry(), the shared flag will indicate
+ * whether this entry is shared by multiple files. If the page has not
+ * previously been associated with any mappings the ->mapping and ->index
+ * fields will be set. If it has already been associated with a mapping
+ * the mapping will be cleared and the share count set. It's then up to
+ * reverse map users like memory_failure() to call back into the filesystem to
+ * recover ->mapping and ->index information. For example by implementing
+ * dax_holder_operations.
*/
-#define for_each_mapped_pfn(entry, pfn) \
- for (pfn = dax_to_pfn(entry); \
- pfn < dax_end_pfn(entry); pfn++)
-
-static inline bool dax_page_is_shared(struct page *page)
+static void dax_folio_make_shared(struct folio *folio)
{
- return page->mapping == PAGE_MAPPING_DAX_SHARED;
+ /*
+ * folio is not currently shared so mark it as shared by clearing
+ * folio->mapping.
+ */
+ folio->mapping = NULL;
+
+ /*
+ * folio has previously been mapped into one address space so set the
+ * share count.
+ */
+ folio->share = 1;
}
-/*
- * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
- * refcount.
- */
-static inline void dax_page_share_get(struct page *page)
+static inline unsigned long dax_folio_put(struct folio *folio)
{
- if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
+ unsigned long ref;
+ int order, i;
+
+ if (!dax_folio_is_shared(folio))
+ ref = 0;
+ else
+ ref = --folio->share;
+
+ if (ref)
+ return ref;
+
+ folio->mapping = NULL;
+ order = folio_order(folio);
+ if (!order)
+ return 0;
+
+ for (i = 0; i < (1UL << order); i++) {
+ struct dev_pagemap *pgmap = page_pgmap(&folio->page);
+ struct page *page = folio_page(folio, i);
+ struct folio *new_folio = (struct folio *)page;
+
+ ClearPageHead(page);
+ clear_compound_head(page);
+
+ new_folio->mapping = NULL;
/*
- * Reset the index if the page was already mapped
- * regularly before.
+ * Reset pgmap which was over-written by
+ * prep_compound_page().
*/
- if (page->mapping)
- page->share = 1;
- page->mapping = PAGE_MAPPING_DAX_SHARED;
+ new_folio->pgmap = pgmap;
+ new_folio->share = 0;
+ WARN_ON_ONCE(folio_ref_count(new_folio));
}
- page->share++;
+
+ return ref;
}
-static inline unsigned long dax_page_share_put(struct page *page)
+static void dax_folio_init(void *entry)
{
- return --page->share;
+ struct folio *folio = dax_to_folio(entry);
+ int order = dax_entry_order(entry);
+
+ /*
+ * Folio should have been split back to order-0 pages in
+ * dax_folio_put() when they were removed from their
+ * final mapping.
+ */
+ WARN_ON_ONCE(folio_order(folio));
+
+ if (order > 0) {
+ prep_compound_page(&folio->page, order);
+ if (order > 1)
+ INIT_LIST_HEAD(&folio->_deferred_list);
+ WARN_ON_ONCE(folio_ref_count(folio));
+ }
}
-/*
- * When it is called in dax_insert_entry(), the shared flag will indicate that
- * whether this entry is shared by multiple files. If so, set the page->mapping
- * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
- */
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address, bool shared)
+ struct vm_area_struct *vma,
+ unsigned long address, bool shared)
{
- unsigned long size = dax_entry_size(entry), pfn, index;
- int i = 0;
+ unsigned long size = dax_entry_size(entry), index;
+ struct folio *folio = dax_to_folio(entry);
+
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
+ return;
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return;
index = linear_page_index(vma, address & ~(size - 1));
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
+ if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
+ if (folio->mapping)
+ dax_folio_make_shared(folio);
- if (shared) {
- dax_page_share_get(page);
- } else {
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
- }
+ WARN_ON_ONCE(!folio->share);
+ WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
+ folio->share++;
+ } else {
+ WARN_ON_ONCE(folio->mapping);
+ dax_folio_init(entry);
+ folio = dax_to_folio(entry);
+ folio->mapping = mapping;
+ folio->index = index;
}
}
static void dax_disassociate_entry(void *entry, struct address_space *mapping,
- bool trunc)
+ bool trunc)
{
- unsigned long pfn;
+ struct folio *folio = dax_to_folio(entry);
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return;
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- if (dax_page_is_shared(page)) {
- /* keep the shared flag if this page is still shared */
- if (dax_page_share_put(page) > 0)
- continue;
- } else
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
- page->mapping = NULL;
- page->index = 0;
- }
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
+ return;
+
+ dax_folio_put(folio);
}
static struct page *dax_busy_page(void *entry)
{
- unsigned long pfn;
+ struct folio *folio = dax_to_folio(entry);
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
+ return NULL;
- if (page_ref_count(page) > 1)
- return page;
- }
- return NULL;
+ if (folio_ref_count(folio) - folio_mapcount(folio))
+ return &folio->page;
+ else
+ return NULL;
}
/**
@@ -580,7 +663,7 @@ static void *grab_mapping_entry(struct xa_state *xas,
retry:
pmd_downgrade = false;
xas_lock_irq(xas);
- entry = get_unlocked_entry(xas, order);
+ entry = get_next_unlocked_entry(xas, order);
if (entry) {
if (dax_is_conflict(entry))
@@ -690,7 +773,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return NULL;
- if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+ if (!dax_mapping(mapping))
return NULL;
/* If end == LLONG_MAX, all pages from start to till end of file */
@@ -716,8 +799,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
xas_for_each(&xas, entry, end_idx) {
if (WARN_ON_ONCE(!xa_is_value(entry)))
continue;
- if (unlikely(dax_is_locked(entry)))
- entry = get_unlocked_entry(&xas, 0);
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
if (entry)
page = dax_busy_page(entry);
put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -743,14 +825,14 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
static int __dax_invalidate_entry(struct address_space *mapping,
- pgoff_t index, bool trunc)
+ pgoff_t index, bool trunc)
{
XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
void *entry;
xas_lock_irq(&xas);
- entry = get_unlocked_entry(&xas, 0);
+ entry = get_next_unlocked_entry(&xas, 0);
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
@@ -776,7 +858,9 @@ static int __dax_clear_dirty_range(struct address_space *mapping,
xas_lock_irq(&xas);
xas_for_each(&xas, entry, end) {
- entry = get_unlocked_entry(&xas, 0);
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
+ if (!entry)
+ continue;
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -813,6 +897,107 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
return ret;
}
+void dax_delete_mapping_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ void *entry;
+ pgoff_t start_idx = start >> PAGE_SHIFT;
+ pgoff_t end_idx;
+ XA_STATE(xas, &mapping->i_pages, start_idx);
+
+ /* If end == LLONG_MAX, all pages from start to till end of file */
+ if (end == LLONG_MAX)
+ end_idx = ULONG_MAX;
+ else
+ end_idx = end >> PAGE_SHIFT;
+
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, end_idx) {
+ if (!xa_is_value(entry))
+ continue;
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
+ if (!entry)
+ continue;
+ dax_disassociate_entry(entry, mapping, true);
+ xas_store(&xas, NULL);
+ mapping->nrpages -= 1UL << dax_entry_order(entry);
+ put_unlocked_entry(&xas, entry, WAKE_ALL);
+ }
+ xas_unlock_irq(&xas);
+}
+EXPORT_SYMBOL_GPL(dax_delete_mapping_range);
+
+static int wait_page_idle(struct page *page,
+ void (cb)(struct inode *),
+ struct inode *inode)
+{
+ return ___wait_var_event(page, dax_page_is_idle(page),
+ TASK_INTERRUPTIBLE, 0, 0, cb(inode));
+}
+
+static void wait_page_idle_uninterruptible(struct page *page,
+ struct inode *inode)
+{
+ ___wait_var_event(page, dax_page_is_idle(page),
+ TASK_UNINTERRUPTIBLE, 0, 0, schedule());
+}
+
+/*
+ * Unmaps the inode and waits for any DMA to complete prior to deleting the
+ * DAX mapping entries for the range.
+ *
+ * For NOWAIT behavior, pass @cb as NULL to early-exit on first found
+ * busy page
+ */
+int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
+ void (cb)(struct inode *))
+{
+ struct page *page;
+ int error = 0;
+
+ if (!dax_mapping(inode->i_mapping))
+ return 0;
+
+ do {
+ page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ if (!page)
+ break;
+ if (!cb) {
+ error = -ERESTARTSYS;
+ break;
+ }
+
+ error = wait_page_idle(page, cb, inode);
+ } while (error == 0);
+
+ if (!page)
+ dax_delete_mapping_range(inode->i_mapping, start, end);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(dax_break_layout);
+
+void dax_break_layout_final(struct inode *inode)
+{
+ struct page *page;
+
+ if (!dax_mapping(inode->i_mapping))
+ return;
+
+ do {
+ page = dax_layout_busy_page_range(inode->i_mapping, 0,
+ LLONG_MAX);
+ if (!page)
+ break;
+
+ wait_page_idle_uninterruptible(page, inode);
+ } while (true);
+
+ if (!page)
+ dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
+}
+EXPORT_SYMBOL_GPL(dax_break_layout_final);
+
/*
* Invalidate DAX entry if it is clean.
*/
@@ -895,8 +1080,9 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
void *old;
dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
- shared);
+ dax_associate_entry(new_entry, mapping, vmf->vma,
+ vmf->address, shared);
+
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -940,7 +1126,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
if (unlikely(dax_is_locked(entry))) {
void *old_entry = entry;
- entry = get_unlocked_entry(xas, 0);
+ entry = get_next_unlocked_entry(xas, 0);
/* Entry got punched out / reallocated? */
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
@@ -1084,9 +1270,7 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
goto out;
if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
goto out;
- /* For larger pages we need devmap */
- if (length > 1 && !pfn_t_devmap(*pfnp))
- goto out;
+
rc = 0;
out_check_addr:
@@ -1193,7 +1377,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
- ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
+ ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false);
trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -1664,7 +1848,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
- int err = 0;
+ struct folio *folio;
+ int ret, err = 0;
pfn_t pfn;
void *kaddr;
@@ -1696,17 +1881,19 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
return dax_fault_return(err);
}
+ folio = dax_to_folio(*entry);
if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);
- /* insert PMD pfn */
+ folio_ref_inc(folio);
if (pmd)
- return vmf_insert_pfn_pmd(vmf, pfn, write);
+ ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)),
+ write);
+ else
+ ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write);
+ folio_put(folio);
- /* insert PTE pfn */
- if (write)
- return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+ return ret;
}
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
@@ -1949,11 +2136,12 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ struct folio *folio;
void *entry;
vm_fault_t ret;
xas_lock_irq(&xas);
- entry = get_unlocked_entry(&xas, order);
+ entry = get_next_unlocked_entry(&xas, order);
/* Did we race with someone splitting entry or so? */
if (!entry || dax_is_conflict(entry) ||
(order == 0 && !dax_is_pte_entry(entry))) {
@@ -1966,14 +2154,17 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
dax_lock_entry(&xas, entry);
xas_unlock_irq(&xas);
+ folio = pfn_folio(pfn_t_to_pfn(pfn));
+ folio_ref_inc(folio);
if (order == 0)
- ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+ ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
#ifdef CONFIG_FS_DAX_PMD
else if (order == PMD_ORDER)
- ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
+ ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
#endif
else
ret = VM_FAULT_FALLBACK;
+ folio_put(folio);
dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
return ret;