path: root/mm/rmap.c
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--  mm/rmap.c | 939
1 file changed, 577 insertions(+), 362 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index c6c4d4ea29a7..67bb273dfb80 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -672,7 +672,7 @@ void try_to_unmap_flush_dirty(void)
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
@@ -681,7 +681,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
if (!pte_accessible(mm, pteval))
return;
- arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
+ arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
tlb_ubc->flush_required = true;
/*
@@ -757,7 +757,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
}
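The batched-flush helper now takes an explicit [start, end) range instead of a single address, so callers that unmap several PTEs of a folio at once can queue one pending flush for the whole span. A minimal sketch of the call pattern, mirroring how try_to_unmap_one() uses it later in this patch (the surrounding locals such as nr_pages are assumptions for illustration only):

	/* Queue one deferred flush covering the whole batched range. */
	unsigned long end_addr = address + nr_pages * PAGE_SIZE;

	if (should_defer_flush(mm, flags))
		set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
	else
		flush_tlb_range(vma, address, end_addr);
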
@@ -889,7 +889,7 @@ static bool folio_referenced_one(struct folio *folio,
if ((!atomic_read(&vma->vm_mm->mm_users) ||
check_stable_address_space(vma->vm_mm)) &&
folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_likely_mapped_shared(folio)) {
+ !folio_maybe_mapped_shared(folio)) {
pra->referenced = -1;
page_vma_mapped_walk_done(&pvmw);
return false;
@@ -1044,6 +1044,14 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
pte_t *pte = pvmw->pte;
pte_t entry = ptep_get(pte);
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are clean and not writable from a
+ * CPU perspective. The MMU notifier takes care of any
+ * device aspects.
+ */
+ if (!pte_present(entry))
+ continue;
if (!pte_dirty(entry) && !pte_write(entry))
continue;
@@ -1127,6 +1135,80 @@ int folio_mkclean(struct folio *folio)
}
EXPORT_SYMBOL_GPL(folio_mkclean);
+struct wrprotect_file_state {
+ int cleaned;
+ pgoff_t pgoff;
+ unsigned long pfn;
+ unsigned long nr_pages;
+};
+
+static bool mapping_wrprotect_range_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long address, void *arg)
+{
+ struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
+ struct page_vma_mapped_walk pvmw = {
+ .pfn = state->pfn,
+ .nr_pages = state->nr_pages,
+ .pgoff = state->pgoff,
+ .vma = vma,
+ .address = address,
+ .flags = PVMW_SYNC,
+ };
+
+ state->cleaned += page_vma_mkclean_one(&pvmw);
+
+ return true;
+}
+
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked);
+
+/**
+ * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
+ *
+ * @mapping: The mapping whose reverse mapping should be traversed.
+ * @pgoff: The page offset at which @pfn is mapped within @mapping.
+ * @pfn: The PFN of the page mapped in @mapping at @pgoff.
+ * @nr_pages: The number of physically contiguous base pages spanned.
+ *
+ * Traverses the reverse mapping, finding all VMAs which contain a shared
+ * mapping of the pages in the specified range in @mapping, and write-protects
+ * them (that is, updates the page tables to mark the mappings read-only such
+ * that a write protection fault arises when the mappings are written to).
+ *
+ * The @pfn value need not refer to a folio, but rather can reference a kernel
+ * allocation which is mapped into userland. We therefore do not require that
+ * the page maps to a folio with a valid mapping or index field, rather the
+ * caller specifies these in @mapping and @pgoff.
+ *
+ * Return: the number of write-protected PTEs, or an error.
+ */
+int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
+ unsigned long pfn, unsigned long nr_pages)
+{
+ struct wrprotect_file_state state = {
+ .cleaned = 0,
+ .pgoff = pgoff,
+ .pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&state,
+ .rmap_one = mapping_wrprotect_range_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
+
+ if (!mapping)
+ return 0;
+
+ __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
+ /* locked = */false);
+
+ return state.cleaned;
+}
+EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
+
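As a usage illustration (not part of this patch): a driver that has mapped a physically contiguous kernel buffer into userspace through an address_space could use the new helper to write-protect every user mapping of that buffer, for example to detect the next write via a page fault. The names my_mapping, buf_pfn and BUF_NR_PAGES are hypothetical:

	static void buf_arm_write_notify(struct address_space *my_mapping,
					 unsigned long buf_pfn)
	{
		/* The buffer starts at page offset 0 within the mapping. */
		int cleaned = mapping_wrprotect_range(my_mapping, 0, buf_pfn,
						      BUF_NR_PAGES);

		if (cleaned > 0)
			pr_debug("write-protected %d PTEs\n", cleaned);
	}
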
/**
* pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
* [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
@@ -1160,8 +1242,8 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
}
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level,
- int *nr_pmdmapped)
+ struct page *page, int nr_pages, struct vm_area_struct *vma,
+ enum rmap_level level, int *nr_pmdmapped)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
@@ -1176,6 +1258,16 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
break;
}
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
+ if (nr == orig_nr_pages)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
do {
first += atomic_inc_and_test(&page->_mapcount);
} while (page++, --nr_pages > 0);
@@ -1184,15 +1276,34 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
nr = first;
- atomic_add(orig_nr_pages, &folio->_large_mapcount);
+ folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ if (level == RMAP_LEVEL_PMD && first)
+ *nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_inc_return_large_mapcount(folio, vma);
+ if (nr == 1)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
if (first) {
nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
- *nr_pmdmapped = folio_nr_pages(folio);
- nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ /*
+ * We only track PMD mappings of PMD-sized
+ * folios separately.
+ */
+ if (level == RMAP_LEVEL_PMD)
+ *nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1201,7 +1312,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
nr = 0;
}
}
- atomic_inc(&folio->_large_mapcount);
+ folio_inc_large_mapcount(folio, vma);
break;
}
return nr;
@@ -1322,7 +1433,7 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
@@ -1338,15 +1449,32 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
case RMAP_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
+ case RMAP_LEVEL_PUD:
+ /*
+ * Keep the compiler happy, we don't support anonymous
+ * PUD mappings.
+ */
+ WARN_ON_ONCE(1);
+ break;
}
}
+
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
+ atomic_read(&folio->_mapcount) > 0, folio);
for (i = 0; i < nr_pages; i++) {
struct page *cur_page = page + i;
- /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
- VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
- (folio_test_large(folio) &&
- folio_entire_mapcount(folio) > 1)) &&
+ VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+ folio_entire_mapcount(folio) > 1 &&
+ PageAnonExclusive(cur_page), folio);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
+ continue;
+
+ /*
+ * While PTE-mapping a THP we have a PMD and a PTE
+ * mapping.
+ */
+ VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
PageAnonExclusive(cur_page), folio);
}
@@ -1426,14 +1554,11 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, rmap_t flags)
{
- const int nr = folio_nr_pages(folio);
const bool exclusive = flags & RMAP_EXCLUSIVE;
- int nr_pmdmapped = 0;
+ int nr = 1, nr_pmdmapped = 0;
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
- VM_BUG_ON_VMA(address < vma->vm_start ||
- address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
/*
* VM_DROPPABLE mappings don't swap; instead they're just dropped when
@@ -1451,29 +1576,35 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
} else if (!folio_test_pmd_mappable(folio)) {
int i;
+ nr = folio_large_nr_pages(folio);
for (i = 0; i < nr; i++) {
struct page *page = folio_page(folio, i);
- /* increment count (starts at -1) */
- atomic_set(&page->_mapcount, 0);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
if (exclusive)
SetPageAnonExclusive(page);
}
- /* increment count (starts at -1) */
- atomic_set(&folio->_large_mapcount, nr - 1);
- atomic_set(&folio->_nr_pages_mapped, nr);
+ folio_set_large_mapcount(folio, nr, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, nr);
} else {
+ nr = folio_large_nr_pages(folio);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
- /* increment count (starts at -1) */
- atomic_set(&folio->_large_mapcount, 0);
- atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
+ folio_set_large_mapcount(folio, 1, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
if (exclusive)
SetPageAnonExclusive(&folio->page);
nr_pmdmapped = nr;
}
+ VM_WARN_ON_ONCE(address < vma->vm_start ||
+ address + (nr << PAGE_SHIFT) > vma->vm_end);
+
__folio_mod_stat(folio, nr, nr_pmdmapped);
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}
@@ -1486,7 +1617,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
__folio_mod_stat(folio, nr, nr_pmdmapped);
/* See comments in folio_add_anon_rmap_*() */
@@ -1531,6 +1662,27 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
+ * @folio: The folio to add the mapping to
+ * @page: The first page to add
+ * @vma: The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum rmap_level level)
@@ -1548,7 +1700,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
break;
}
- atomic_sub(nr_pages, &folio->_large_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_nr_pages(folio);
+ } else {
+ partially_mapped = nr < folio_large_nr_pages(folio) &&
+ !folio_entire_mapcount(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_sub_large_mapcount(folio, nr_pages, vma);
do {
last += atomic_add_negative(-1, &page->_mapcount);
} while (page++, --nr_pages > 0);
@@ -1560,13 +1725,32 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
- atomic_dec(&folio->_large_mapcount);
+ case RMAP_LEVEL_PUD:
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ last = atomic_add_negative(-1, &folio->_entire_mapcount);
+ if (level == RMAP_LEVEL_PMD && last)
+ nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_dec_return_large_mapcount(folio, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ } else {
+ partially_mapped = last &&
+ nr < folio_large_nr_pages(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_dec_large_mapcount(folio, vma);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
- nr_pmdmapped = folio_nr_pages(folio);
- nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ if (level == RMAP_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1640,6 +1824,46 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
+ * @folio: The folio to remove the mapping from
+ * @page: The first page to remove
+ * @vma: The vm area from which the mapping is removed
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
+/* We support batch unmapping of PTEs for lazyfree large folios */
+static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
+ struct folio *folio, pte_t *ptep)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ int max_nr = folio_nr_pages(folio);
+ pte_t pte = ptep_get(ptep);
+
+ if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+ return false;
+ if (pte_unused(pte))
+ return false;
+ if (pte_pfn(pte) != folio_pfn(folio))
+ return false;
+
+ return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+ NULL, NULL) == max_nr;
+}
+
/*
* @arg: enum ttu_flags will be passed to this argument
*/
@@ -1648,11 +1872,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long nr_pages = 1, end_addr;
unsigned long pfn;
unsigned long hsz = 0;
@@ -1702,9 +1927,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}
if (!pvmw.pte) {
- if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
- folio))
- goto walk_done;
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
+ if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
+ goto walk_done;
+ /*
+ * unmap_huge_pmd_locked has either already marked
+ * the folio as swap-backed or decided to retain it
+ * due to GUP or speculative references.
+ */
+ goto walk_abort;
+ }
if (flags & TTU_SPLIT_HUGE_PMD) {
/*
@@ -1722,7 +1954,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
+ } else {
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+ }
+
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
@@ -1778,24 +2021,33 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
- flush_cache_page(vma, address, pfn);
- /* Nuke the page table entry. */
- if (should_defer_flush(mm, flags)) {
- /*
- * We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the folio.
- * If the entry was previously clean then the
- * architecture must guarantee that a clear->dirty
- * transition on a cached TLB entry is written through
- * and traps if the PTE is unmapped.
- */
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else if (likely(pte_present(pteval))) {
+ if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
+ can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
+ nr_pages = folio_nr_pages(folio);
+ end_addr = address + nr_pages * PAGE_SIZE;
+ flush_cache_range(vma, address, end_addr);
- set_tlb_ubc_flush_pending(mm, pteval, address);
- } else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
- }
+ /* Nuke the page table entry. */
+ pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ if (should_defer_flush(mm, flags))
+ set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+ else
+ flush_tlb_range(vma, address, end_addr);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
}
/*
@@ -1805,10 +2057,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
@@ -1822,8 +2070,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -1868,40 +2116,41 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
smp_rmb();
- /*
- * The only page refs must be one from isolation
- * plus the rmap(s) (dropped by discard:).
- */
- if (ref_count == 1 + map_count &&
- (!folio_test_dirty(folio) ||
- /*
- * Unlike MADV_FREE mappings, VM_DROPPABLE
- * ones can be dropped even if they've
- * been dirtied.
- */
- (vma->vm_flags & VM_DROPPABLE))) {
- dec_mm_counter(mm, MM_ANONPAGES);
- goto discard;
- }
-
- /*
- * If the folio was redirtied, it cannot be
- * discarded. Remap the page to page table.
- */
- set_pte_at(mm, address, pvmw.pte, pteval);
- /*
- * Unlike MADV_FREE mappings, VM_DROPPABLE ones
- * never get swap backed on failure to drop.
- */
- if (!(vma->vm_flags & VM_DROPPABLE))
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ /*
+ * redirtied either using the page table or a previously
+ * obtained GUP reference.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
folio_set_swapbacked(folio);
- goto walk_abort;
+ goto walk_abort;
+ } else if (ref_count != 1 + map_count) {
+ /*
+ * Additional reference. Could be a GUP reference or any
+ * speculative reference. GUP users must mark the folio
+ * dirty if there was a modification. This folio cannot be
+ * reclaimed right now either way, so act just like nothing
+ * happened.
+ * We'll come back here later and detect if the folio was
+ * dirtied when the additional reference is gone.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
+ goto walk_abort;
+ }
+ add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
+ goto discard;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
+
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1926,10 +2175,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (anon_exclusive)
swp_pte = pte_swp_mkexclusive(swp_pte);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
set_pte_at(mm, address, pvmw.pte, swp_pte);
} else {
/*
@@ -1946,13 +2202,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter_file(folio));
}
discard:
- if (unlikely(folio_test_hugetlb(folio)))
+ if (unlikely(folio_test_hugetlb(folio))) {
hugetlb_remove_rmap(folio);
- else
- folio_remove_rmap_pte(folio, subpage, vma);
+ } else {
+ folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+ folio_ref_sub(folio, nr_pages - 1);
+ }
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
folio_put(folio);
+ /* We have already batched the entire folio */
+ if (nr_pages > 1)
+ goto walk_done;
continue;
walk_abort:
ret = false;
@@ -2013,9 +2274,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, writable, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long pfn;
@@ -2082,24 +2343,19 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
-
- if (folio_is_zone_device(folio)) {
- /*
- * Our PTE is a non-present device exclusive entry and
- * calculating the subpage as for the common case would
- * result in an invalid pointer.
- *
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
- */
- VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
- subpage = &folio->page;
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
} else {
- subpage = folio_page(folio, pfn - folio_pfn(folio));
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
+
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
@@ -2155,7 +2411,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else if (likely(pte_present(pteval))) {
flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
@@ -2169,58 +2428,27 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval, address);
+ set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
+ writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
}
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
+ VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
+ !anon_exclusive, folio);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (folio_is_device_private(folio)) {
- unsigned long pfn = folio_pfn(folio);
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (anon_exclusive)
- WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
- subpage));
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = pte_to_swp_entry(pteval);
- if (is_writable_device_private_entry(entry))
- entry = make_writable_migration_entry(pfn);
- else if (anon_exclusive)
- entry = make_readable_exclusive_migration_entry(pfn);
- else
- entry = make_readable_migration_entry(pfn);
- swp_pte = swp_entry_to_pte(entry);
+ if (PageHWPoison(subpage)) {
+ VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
- /*
- * pteval maps a zone device page and is therefore
- * a swap pte.
- */
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
- trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
- folio_order(folio));
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
- } else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
@@ -2230,8 +2458,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -2247,6 +2475,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t swp_pte;
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte,
@@ -2257,8 +2490,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
- VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
- !anon_exclusive, subpage);
/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
if (folio_test_hugetlb(folio)) {
@@ -2283,7 +2514,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- if (pte_write(pteval))
+ if (writable)
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else if (anon_exclusive)
@@ -2292,15 +2523,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
- if (pte_young(pteval))
- entry = make_migration_entry_young(entry);
- if (pte_dirty(pteval))
- entry = make_migration_entry_dirty(entry);
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pteval))
+ entry = make_migration_entry_dirty(entry);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
hsz);
@@ -2375,190 +2614,139 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
}
#ifdef CONFIG_DEVICE_PRIVATE
-struct make_exclusive_args {
- struct mm_struct *mm;
- unsigned long address;
- void *owner;
- bool valid;
-};
-
-static bool page_make_device_exclusive_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long address, void *priv)
+/**
+ * make_device_exclusive() - Mark a page for exclusive use by a device
+ * @mm: mm_struct of associated target process
+ * @addr: the virtual address to mark for exclusive device access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ * @foliop: folio pointer will be stored here on success.
+ *
+ * This function looks up the page mapped at the given address, grabs a
+ * folio reference, locks the folio and replaces the PTE with special
+ * device-exclusive PFN swap entry, preventing access through the process
+ * page tables. The function will return with the folio locked and referenced.
+ *
+ * On fault, the device-exclusive entries are replaced with the original PTE
+ * under folio lock, after calling MMU notifiers.
+ *
+ * Only anonymous non-hugetlb folios are supported and the VMA must have
+ * write permissions such that we can fault in the anonymous page writable
+ * in order to mark it exclusive. The caller must hold the mmap_lock in read
+ * mode.
+ *
+ * A driver using this to program access from a device must use a mmu notifier
+ * critical section to hold a device specific lock during programming. Once
+ * programming is complete it should drop the folio lock and reference after
+ * which point CPU access to the page will revoke the exclusive access.
+ *
+ * Notes:
+ * #. This function always operates on individual PTEs mapping individual
+ * pages. PMD-sized THPs are first remapped to be mapped by PTEs before
+ * the conversion happens on a single PTE corresponding to @addr.
+ * #. While concurrent access through the process page tables is prevented,
+ * concurrent access through other page references (e.g., earlier GUP
+ * invocation) is not handled and not supported.
+ * #. device-exclusive entries are considered "clean" and "old" by core-mm.
+ * Device drivers must update the folio state when informed by MMU
+ * notifiers.
+ *
+ * Returns: pointer to mapped page on success, otherwise a negative error.
+ */
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop)
{
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
- struct make_exclusive_args *args = priv;
- pte_t pteval;
- struct page *subpage;
- bool ret = true;
struct mmu_notifier_range range;
+ struct folio *folio, *fw_folio;
+ struct vm_area_struct *vma;
+ struct folio_walk fw;
+ struct page *page;
swp_entry_t entry;
pte_t swp_pte;
- pte_t ptent;
-
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
- vma->vm_mm, address, min(vma->vm_end,
- address + folio_size(folio)),
- args->owner);
- mmu_notifier_invalidate_range_start(&range);
-
- while (page_vma_mapped_walk(&pvmw)) {
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_FOLIO(!pvmw.pte, folio);
-
- ptent = ptep_get(pvmw.pte);
- if (!pte_present(ptent)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
-
- subpage = folio_page(folio,
- pte_pfn(ptent) - folio_pfn(folio));
- address = pvmw.address;
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(ptent));
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
-
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
- /*
- * Check that our target page is still mapped at the expected
- * address.
- */
- if (args->mm == mm && args->address == address &&
- pte_write(pteval))
- args->valid = true;
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- if (pte_write(pteval))
- entry = make_writable_device_exclusive_entry(
- page_to_pfn(subpage));
- else
- entry = make_readable_device_exclusive_entry(
- page_to_pfn(subpage));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ int ret;
- set_pte_at(mm, address, pvmw.pte, swp_pte);
+ mmap_assert_locked(mm);
+ addr = PAGE_ALIGN_DOWN(addr);
- /*
- * There is a reference on the page for the swap entry which has
- * been removed, so shouldn't take another.
- */
- folio_remove_rmap_pte(folio, subpage, vma);
+ /*
+ * Fault in the page writable and try to lock it; note that if the
+ * address would already be marked for exclusive use by a device,
+ * the GUP call would undo that first by triggering a fault.
+ *
+ * If any other device would already map this page exclusively, the
+ * fault will trigger a conversion to an ordinary
+ * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
+ */
+retry:
+ page = get_user_page_vma_remote(mm, addr,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ &vma);
+ if (IS_ERR(page))
+ return page;
+ folio = page_folio(page);
+
+ if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EOPNOTSUPP);
}
- mmu_notifier_invalidate_range_end(&range);
-
- return ret;
-}
-
-/**
- * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
- * @folio: The folio to replace page table entries for.
- * @mm: The mm_struct where the folio is expected to be mapped.
- * @address: Address where the folio is expected to be mapped.
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
- *
- * Tries to remove all the page table entries which are mapping this
- * folio and replace them with special device exclusive swap entries to
- * grant a device exclusive access to the folio.
- *
- * Context: Caller must hold the folio lock.
- * Return: false if the page is still mapped, or if it could not be unmapped
- * from the expected address. Otherwise returns true (success).
- */
-static bool folio_make_device_exclusive(struct folio *folio,
- struct mm_struct *mm, unsigned long address, void *owner)
-{
- struct make_exclusive_args args = {
- .mm = mm,
- .address = address,
- .owner = owner,
- .valid = false,
- };
- struct rmap_walk_control rwc = {
- .rmap_one = page_make_device_exclusive_one,
- .done = folio_not_mapped,
- .anon_lock = folio_lock_anon_vma_read,
- .arg = &args,
- };
+ ret = folio_lock_killable(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
+ }
/*
- * Restrict to anonymous folios for now to avoid potential writeback
- * issues.
+ * Inform secondary MMUs that we are going to convert this PTE to
+ * device-exclusive, such that they unmap it now. Note that the
+ * caller must filter this event out to prevent livelocks.
*/
- if (!folio_test_anon(folio))
- return false;
-
- rmap_walk(folio, &rwc);
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mm, addr, addr + PAGE_SIZE, owner);
+ mmu_notifier_invalidate_range_start(&range);
- return args.valid && !folio_mapcount(folio);
-}
+ /*
+ * Let's do a second walk and make sure we still find the same page
+ * mapped writable. Note that any page of an anonymous folio can
+ * only be mapped writable using exactly one PTE ("exclusive"), so
+ * there cannot be other mappings.
+ */
+ fw_folio = folio_walk_start(&fw, vma, addr, 0);
+ if (fw_folio != folio || fw.page != page ||
+ fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
+ if (fw_folio)
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ folio_unlock(folio);
+ folio_put(folio);
+ goto retry;
+ }
-/**
- * make_device_exclusive_range() - Mark a range for exclusive use by a device
- * @mm: mm_struct of associated target process
- * @start: start of the region to mark for exclusive device access
- * @end: end address of region
- * @pages: returns the pages which were successfully marked for exclusive access
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
- *
- * Returns: number of pages found in the range by GUP. A page is marked for
- * exclusive access only if the page pointer is non-NULL.
- *
- * This function finds ptes mapping page(s) to the given address range, locks
- * them and replaces mappings with special swap entries preventing userspace CPU
- * access. On fault these entries are replaced with the original mapping after
- * calling MMU notifiers.
- *
- * A driver using this to program access from a device must use a mmu notifier
- * critical section to hold a device specific lock during programming. Once
- * programming is complete it should drop the page lock and reference after
- * which point CPU access to the page will revoke the exclusive access.
- */
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *owner)
-{
- long npages = (end - start) >> PAGE_SHIFT;
- long i;
-
- npages = get_user_pages_remote(mm, start, npages,
- FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL);
- if (npages < 0)
- return npages;
-
- for (i = 0; i < npages; i++, start += PAGE_SIZE) {
- struct folio *folio = page_folio(pages[i]);
- if (PageTail(pages[i]) || !folio_trylock(folio)) {
- folio_put(folio);
- pages[i] = NULL;
- continue;
- }
+ /* Nuke the page table entry so we get the uptodate dirty bit. */
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
- if (!folio_make_device_exclusive(folio, mm, start, owner)) {
- folio_unlock(folio);
- folio_put(folio);
- pages[i] = NULL;
- }
- }
+ /* Set the dirty flag on the folio now the PTE is gone. */
+ if (pte_dirty(fw.pte))
+ folio_mark_dirty(folio);
- return npages;
+ /*
+ * Store the pfn of the page in a special device-exclusive PFN swap PTE.
+ * do_swap_page() will trigger the conversion back while holding the
+ * folio lock.
+ */
+ entry = make_device_exclusive_entry(page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(fw.pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ /* The pte is writable, uffd-wp does not apply. */
+ set_pte_at(mm, addr, fw.ptep, swp_pte);
+
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ *foliop = folio;
+ return page;
}
-EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
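To illustrate the calling convention documented above (mmap_lock held for read, folio returned locked and referenced, device programming done under a driver lock that its MMU notifier also takes), here is a hedged driver-side sketch; struct my_device, dev->exclusive_lock and my_dev_program_atomic() are hypothetical names, not kernel APIs:

	static int my_dev_make_addr_exclusive(struct my_device *dev,
					      struct mm_struct *mm,
					      unsigned long addr)
	{
		struct folio *folio;
		struct page *page;
		int ret;

		mmap_read_lock(mm);
		page = make_device_exclusive(mm, addr, dev, &folio);
		if (IS_ERR(page)) {
			mmap_read_unlock(mm);
			return PTR_ERR(page);
		}

		/*
		 * Program the device under a lock that the driver's
		 * MMU_NOTIFY_EXCLUSIVE handler also takes, so a concurrent
		 * invalidation cannot race with the programming.
		 */
		mutex_lock(&dev->exclusive_lock);
		ret = my_dev_program_atomic(dev, addr, page);
		mutex_unlock(&dev->exclusive_lock);

		/* Programming is complete: drop the folio lock and reference. */
		folio_unlock(folio);
		folio_put(folio);
		mmap_read_unlock(mm);
		return ret;
	}
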
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -2653,35 +2841,37 @@ static void rmap_walk_anon(struct folio *folio,
anon_vma_unlock_read(anon_vma);
}
-/*
- * rmap_walk_file - do something to file page using the object-based rmap method
- * @folio: the folio to be handled
- * @rwc: control variable according to each walk type
- * @locked: caller holds relevant rmap lock
+/**
+ * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
+ * of a page mapped within a specified page cache object at a specified offset.
*
- * Find all the mappings of a folio using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
+ * @folio: Either the folio whose mappings to traverse, or if NULL,
+ * the callbacks specified in @rwc will be configured such
+ * as to be able to look up mappings correctly.
+ * @mapping: The page cache object whose mapping VMAs we intend to
+ * traverse. If @folio is non-NULL, this should be equal to
+ * folio_mapping(folio).
+ * @pgoff_start: The offset within @mapping of the page which we are
+ * looking up. If @folio is non-NULL, this should be equal
+ * to folio_pgoff(folio).
+ * @nr_pages: The number of pages mapped by the mapping. If @folio is
+ * non-NULL, this should be equal to folio_nr_pages(folio).
+ * @rwc: The reverse mapping walk control object describing how
+ * the traversal should proceed.
+ * @locked: Is the @mapping already locked? If not, we acquire the
+ * lock.
*/
-static void rmap_walk_file(struct folio *folio,
- struct rmap_walk_control *rwc, bool locked)
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked)
{
- struct address_space *mapping = folio_mapping(folio);
- pgoff_t pgoff_start, pgoff_end;
+ pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
struct vm_area_struct *vma;
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_rwsem.
- */
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
+ VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
+ VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
- if (!mapping)
- return;
-
- pgoff_start = folio_pgoff(folio);
- pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
if (!locked) {
if (i_mmap_trylock_read(mapping))
goto lookup;
@@ -2696,8 +2886,7 @@ static void rmap_walk_file(struct folio *folio,
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(vma, pgoff_start,
- folio_nr_pages(folio));
+ unsigned long address = vma_address(vma, pgoff_start, nr_pages);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2710,12 +2899,38 @@ lookup:
if (rwc->done && rwc->done(folio))
goto done;
}
-
done:
if (!locked)
i_mmap_unlock_read(mapping);
}
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @folio: the folio to be handled
+ * @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
+ *
+ * Find all the mappings of a folio using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ */
+static void rmap_walk_file(struct folio *folio,
+ struct rmap_walk_control *rwc, bool locked)
+{
+ /*
+ * The folio lock not only makes sure that folio->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the structure
+ * at mapping cannot be freed and reused yet, so we can safely take
+ * mapping->i_mmap_rwsem.
+ */
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (!folio->mapping)
+ return;
+
+ __rmap_walk_file(folio, folio->mapping, folio->index,
+ folio_nr_pages(folio), rwc, locked);
+}
+
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
if (unlikely(folio_test_ksm(folio)))