Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  314
1 file changed, 203 insertions, 111 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 398c031be9ba..b43428ae76cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1436,7 +1436,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
static inline bool should_zap_cows(struct zap_details *details)
{
/* By default, zap all pages */
- if (!details)
+ if (!details || details->reclaim_pt)
return true;
/* Or, we zap COWed pages only if the caller wants to */
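For illustration only (not part of the patch), a minimal user-space model of the predicate above. The struct here is a stand-in for the kernel's zap_details; the field names mirror even_cows and the new reclaim_pt flag:

#include <stdbool.h>
#include <stdio.h>

struct zap_details_model {
	bool even_cows;		/* caller explicitly wants COWed pages zapped */
	bool reclaim_pt;	/* new: caller wants empty PTE tables reclaimed */
};

static bool should_zap_cows_model(const struct zap_details_model *details)
{
	/* By default, zap all pages */
	if (!details || details->reclaim_pt)
		return true;
	/* Or, we zap COWed pages only if the caller wants to */
	return details->even_cows;
}

int main(void)
{
	struct zap_details_model d = { .even_cows = false, .reclaim_pt = true };

	printf("%d %d\n", should_zap_cows_model(NULL),	/* 1: default */
	       should_zap_cows_model(&d));		/* 1: reclaim_pt wins */
	return 0;
}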
@@ -1466,34 +1466,42 @@ static inline bool zap_drop_markers(struct zap_details *details)
/*
* This function makes sure that we'll replace the none pte with an uffd-wp
* swap special pte marker when necessary. Must be called with the pgtable lock held.
+ *
+ * Returns true if any uffd-wp pte was installed, false otherwise.
*/
-static inline void
+static inline bool
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
+ bool was_installed = false;
+
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Zap on anonymous always means dropping everything */
if (vma_is_anonymous(vma))
- return;
+ return false;
if (zap_drop_markers(details))
- return;
+ return false;
for (;;) {
/* the PFN in the PTE is irrelevant. */
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
+ was_installed = true;
if (--nr == 0)
break;
pte++;
addr += PAGE_SIZE;
}
+#endif
+ return was_installed;
}
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, struct folio *folio,
struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
unsigned long addr, struct zap_details *details, int *rss,
- bool *force_flush, bool *force_break)
+ bool *force_flush, bool *force_break, bool *any_skipped)
{
struct mm_struct *mm = tlb->mm;
bool delay_rmap = false;
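Also for illustration (not kernel code): the same shape as the #ifdef in zap_install_uffd_wp_if_needed() above, written self-contained — a helper gated on a config option that stays callable and simply reports that nothing was installed when the option is off. CONFIG_FEATURE_X is a made-up stand-in for CONFIG_PTE_MARKER_UFFD_WP:

#include <stdbool.h>
#include <stdio.h>

/* #define CONFIG_FEATURE_X */

/* Returns true if any marker was installed, false otherwise. */
static bool install_markers_if_needed(int *slots, int nr)
{
	bool was_installed = false;

#ifdef CONFIG_FEATURE_X
	for (int i = 0; i < nr; i++) {
		if (slots[i] == 0) {
			slots[i] = -1;	/* -1 plays the role of the marker */
			was_installed = true;
		}
	}
#endif
	return was_installed;
}

int main(void)
{
	int slots[3] = { 0, 5, 0 };

	/* Prints 0 with the config commented out, 1 if it is defined. */
	printf("%d\n", install_markers_if_needed(slots, 3));
	return 0;
}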
@@ -1519,8 +1527,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entries(tlb, pte, nr, addr);
if (unlikely(userfaultfd_pte_wp(vma, ptent)))
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
- ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
+ nr, details, ptent);
if (!delay_rmap) {
folio_remove_rmap_ptes(folio, page, nr, vma);
@@ -1544,7 +1552,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
unsigned int max_nr, unsigned long addr,
struct zap_details *details, int *rss, bool *force_flush,
- bool *force_break)
+ bool *force_break, bool *any_skipped)
{
const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
@@ -1559,15 +1567,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
if (userfaultfd_pte_wp(vma, ptent))
- zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
- details, ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
+ pte, 1, details, ptent);
ksm_might_unmap_zero_page(mm, ptent);
return 1;
}
folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
+ if (unlikely(!should_zap_folio(details, folio))) {
+ *any_skipped = true;
return 1;
+ }
/*
* Make sure that the common "small folio" case is as fast as possible
@@ -1579,14 +1589,121 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
- force_break);
+ force_break, any_skipped);
return nr;
}
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
- details, rss, force_flush, force_break);
+ details, rss, force_flush, force_break, any_skipped);
return 1;
}
+static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *any_skipped)
+{
+ swp_entry_t entry;
+ int nr = 1;
+
+ *any_skipped = true;
+ entry = pte_to_swp_entry(ptent);
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = page_folio(page);
+
+ if (unlikely(!should_zap_folio(details, folio)))
+ return 1;
+ /*
+ * Both device private/exclusive mappings should only
+ * work with anonymous pages so far, so we don't need to
+ * consider the uffd-wp bit when zapping. For more information,
+ * see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
+ rss[mm_counter(folio)]--;
+ if (is_device_private_entry(entry))
+ folio_remove_rmap_pte(folio, page, vma);
+ folio_put(folio);
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entries, hence private anon pages */
+ if (!should_zap_cows(details))
+ return 1;
+
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ rss[MM_SWAPENTS] -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ } else if (is_migration_entry(entry)) {
+ struct folio *folio = pfn_swap_entry_folio(entry);
+
+ if (!should_zap_folio(details, folio))
+ return 1;
+ rss[mm_counter(folio)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /*
+ * For anon: always drop the marker; for file: only
+ * drop the marker if explicitly requested.
+ */
+ if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
+ return 1;
+ } else if (is_guard_swp_entry(entry)) {
+ /*
+ * Ordinary zapping should not remove guard PTE
+ * markers. Only do so if we should remove PTE markers
+ * in general.
+ */
+ if (!zap_drop_markers(details))
+ return 1;
+ } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
+ if (!should_zap_cows(details))
+ return 1;
+ } else {
+ /* We should have covered all the swap entry types */
+ pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
+ WARN_ON_ONCE(1);
+ }
+ clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
+
+ return nr;
+}
+
+static inline int do_zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break,
+ bool *any_skipped)
+{
+ pte_t ptent = ptep_get(pte);
+ int max_nr = (end - addr) / PAGE_SIZE;
+ int nr = 0;
+
+ /* Skip all consecutive none ptes */
+ if (pte_none(ptent)) {
+ for (nr = 1; nr < max_nr; nr++) {
+ ptent = ptep_get(pte + nr);
+ if (!pte_none(ptent))
+ break;
+ }
+ max_nr -= nr;
+ if (!max_nr)
+ return nr;
+ pte += nr;
+ addr += nr * PAGE_SIZE;
+ }
+
+ if (pte_present(ptent))
+ nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, force_flush, force_break,
+ any_skipped);
+ else
+ nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, any_skipped);
+
+ return nr;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
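A stand-alone sketch (not from the patch) of the batching do_zap_pte_range() performs above: skip the run of none entries first, then hand the remaining budget to a (simplified) helper in one call. Array slots stand in for PTEs, with 0 meaning pte_none():

#include <stdio.h>

static int zap_present_model(const int *pte, int max_nr)
{
	int nr = 0;

	/* Batch consecutive present entries (value > 0). */
	while (nr < max_nr && pte[nr] > 0)
		nr++;
	/* A non-present entry is handled one at a time in this model. */
	return nr ? nr : 1;
}

static int do_zap_model(const int *pte, int max_nr)
{
	int nr = 0;

	/* Skip all consecutive none entries */
	while (nr < max_nr && pte[nr] == 0)
		nr++;
	max_nr -= nr;
	if (!max_nr)
		return nr;	/* the whole batch was empty */

	/* Zap whatever follows and add it to the count. */
	return nr + zap_present_model(pte + nr, max_nr);
}

int main(void)
{
	int ptes[] = { 0, 0, 0, 5, 6, 0, 7 };

	/* Three none slots plus a run of two present ones: prints 5. */
	printf("%d\n", do_zap_model(ptes, 7));
	return 0;
}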
@@ -1598,9 +1715,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
- swp_entry_t entry;
+ pmd_t pmdval;
+ unsigned long start = addr;
+ bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
+ bool direct_reclaim = true;
int nr;
+retry:
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1610,90 +1731,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = ptep_get(pte);
- struct folio *folio;
- struct page *page;
- int max_nr;
+ bool any_skipped = false;
- nr = 1;
- if (pte_none(ptent))
- continue;
-
- if (need_resched())
+ if (need_resched()) {
+ direct_reclaim = false;
break;
-
- if (pte_present(ptent)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
- addr, details, rss, &force_flush,
- &force_break);
- if (unlikely(force_break)) {
- addr += nr * PAGE_SIZE;
- break;
- }
- continue;
}
- entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
- continue;
- /*
- * Both device private/exclusive mappings should only
- * work with anonymous page so far, so we don't need to
- * consider uffd-wp bit when zap. For more information,
- * see zap_install_uffd_wp_if_needed().
- */
- WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(folio)]--;
- if (is_device_private_entry(entry))
- folio_remove_rmap_pte(folio, page, vma);
- folio_put(folio);
- } else if (!non_swap_entry(entry)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = swap_pte_batch(pte, max_nr, ptent);
- /* Genuine swap entries, hence a private anon pages */
- if (!should_zap_cows(details))
- continue;
- rss[MM_SWAPENTS] -= nr;
- free_swap_and_cache_nr(entry, nr);
- } else if (is_migration_entry(entry)) {
- folio = pfn_swap_entry_folio(entry);
- if (!should_zap_folio(details, folio))
- continue;
- rss[mm_counter(folio)]--;
- } else if (pte_marker_entry_uffd_wp(entry)) {
- /*
- * For anon: always drop the marker; for file: only
- * drop the marker if explicitly requested.
- */
- if (!vma_is_anonymous(vma) &&
- !zap_drop_markers(details))
- continue;
- } else if (is_guard_swp_entry(entry)) {
- /*
- * Ordinary zapping should not remove guard PTE
- * markers. Only do so if we should remove PTE markers
- * in general.
- */
- if (!zap_drop_markers(details))
- continue;
- } else if (is_hwpoison_entry(entry) ||
- is_poisoned_swp_entry(entry)) {
- if (!should_zap_cows(details))
- continue;
- } else {
- /* We should have covered all the swap entry types */
- pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
- WARN_ON_ONCE(1);
+ nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
+ &force_flush, &force_break, &any_skipped);
+ if (any_skipped)
+ can_reclaim_pt = false;
+ if (unlikely(force_break)) {
+ addr += nr * PAGE_SIZE;
+ direct_reclaim = false;
+ break;
}
- clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
+ /*
+ * Fast path: try to hold the pmd lock and unmap the PTE page.
+ *
+ * If the pte lock was released midway (retry case), or if the attempt
+ * to hold the pmd lock failed, then we need to recheck all pte entries
+ * to ensure they are still none, thereby preventing the pte entries
+ * from being repopulated by another thread.
+ */
+ if (can_reclaim_pt && direct_reclaim && addr == end)
+ direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1713,6 +1779,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (force_flush)
tlb_flush_mmu(tlb);
+ if (addr != end) {
+ cond_resched();
+ force_flush = false;
+ force_break = false;
+ goto retry;
+ }
+
+ if (can_reclaim_pt) {
+ if (direct_reclaim)
+ free_pte(mm, start, tlb, pmdval);
+ else
+ try_to_free_pte(mm, pmd, start, tlb);
+ }
+
return addr;
}
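To summarize the new control flow in zap_pte_range(), a compressed user-space model (illustration only, with the locking left out): an interruption clears direct_reclaim and forces a retry, any kept entry clears can_reclaim_pt, and the table is freed either directly or after a recheck. All names below are stand-ins, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>

#define NR_ENTRIES 8

static void zap_range(int *table, int break_at)
{
	bool can_reclaim_pt = true;	/* caller asked for PT reclaim */
	bool direct_reclaim = true;
	int i = 0;

retry:
	for (; i < NR_ENTRIES; i++) {
		if (i == break_at) {		/* simulated need_resched() */
			direct_reclaim = false;
			break;
		}
		if (table[i] < 0)
			can_reclaim_pt = false;	/* this entry was skipped */
		else
			table[i] = 0;		/* zapped */
	}
	if (i != NR_ENTRIES) {
		break_at = -1;			/* only interrupt once */
		goto retry;
	}

	if (!can_reclaim_pt)
		printf("some entries survived: keep the table\n");
	else if (direct_reclaim)
		printf("fast path: free the table under the pmd lock\n");
	else
		printf("slow path: recheck all entries, then free\n");
}

int main(void)
{
	int table[NR_ENTRIES] = { 1, 1, 1, 1, 1, 1, 1, 1 };

	zap_range(table, 3);	/* interrupted once -> slow path */
	return 0;
}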
@@ -1935,7 +2015,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
struct mmu_notifier_range range;
struct mmu_gather tlb;
- lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
@@ -2971,8 +3050,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
- if (WARN_ON_ONCE(pgd_leaf(*pgd)))
- return -EINVAL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
+ err = -EINVAL;
+ break;
+ }
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
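The hunk above switches from an early return to recording the error and breaking, so the function's common exit path still runs. A minimal sketch of that pattern (illustration only, not kernel code):

#include <stdio.h>

static int process_all(const int *items, int n)
{
	int err = 0;

	for (int i = 0; i < n; i++) {
		if (items[i] < 0) {
			err = -1;	/* remember the failure... */
			break;		/* ...but still reach the exit path */
		}
		printf("processed %d\n", items[i]);
	}

	printf("common exit path runs regardless\n");
	return err;
}

int main(void)
{
	int items[] = { 1, 2, -3, 4 };

	if (process_all(items, 4))
		printf("caller sees the error after cleanup ran\n");
	return 0;
}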
@@ -3013,7 +3094,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
{
return __apply_to_page_range(mm, addr, size, fn, data, false);
}
-EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
/*
* handle_pte_fault chooses page fault handler according to an entry which was
@@ -4189,8 +4269,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
gfp, entry))
return folio;
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
folio_put(folio);
}
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
order = next_order(&orders, order);
}
@@ -4267,10 +4349,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Get a page reference while we know the page can't be
* freed.
*/
- get_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
- put_page(vmf->page);
+ if (trylock_page(vmf->page)) {
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_pte_marker_entry(entry)) {
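The trylock change above follows a common "try the lock, otherwise back off and let the caller retry" shape. A POSIX sketch of that shape only (pthread_mutex_trylock() standing in for trylock_page(); nothing here is from the patch):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 0 when the work was done, -1 when the caller should retry. */
static int migrate_if_unlocked(void)
{
	if (pthread_mutex_trylock(&page_lock) != 0)
		return -1;	/* someone else holds it; retry the fault later */

	printf("page locked, safe to migrate\n");
	pthread_mutex_unlock(&page_lock);
	return 0;
}

int main(void)
{
	if (migrate_if_unlocked())
		printf("would retry\n");
	return 0;
}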
@@ -5102,7 +5189,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
!(vma->vm_flags & VM_SHARED);
int type, nr_pages;
- unsigned long addr = vmf->address;
+ unsigned long addr;
+ bool needs_fallback = false;
+
+fallback:
+ addr = vmf->address;
/* Did we COW the page? */
if (is_cow)
@@ -5141,7 +5232,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* approach also applies to non-anonymous-shmem faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
+ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -5177,9 +5269,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
goto unlock;
} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
- update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
- ret = VM_FAULT_NOPAGE;
- goto unlock;
+ needs_fallback = true;
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto fallback;
}
folio_ref_add(folio, nr_pages - 1);
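The fallback above retries the fault with a single page when the PTE range is no longer entirely none. A simplified user-space sketch of that retry (illustration only; array slots stand in for PTEs, and unlocking the PTL is reduced to a comment):

#include <stdbool.h>
#include <stdio.h>

static bool range_is_empty(const int *slots, int nr)
{
	for (int i = 0; i < nr; i++)
		if (slots[i] != 0)
			return false;
	return true;
}

static void install(int *slots, int base, int nr)
{
	bool needs_fallback = false;

fallback:
	if (needs_fallback)
		nr = 1;		/* shrink the batch to a single entry */

	if (nr > 1 && !range_is_empty(slots, nr)) {
		needs_fallback = true;
		goto fallback;	/* in the kernel: unlock the PTL first */
	}

	for (int i = 0; i < nr; i++)
		slots[i] = base + i;
	printf("installed %d entr%s\n", nr, nr == 1 ? "y" : "ies");
}

int main(void)
{
	int slots[4] = { 0, 0, 7, 0 };	/* slot 2 raced and is already populated */

	install(slots, 100, 4);	/* falls back and installs 1 entry */
	return 0;
}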
@@ -5625,7 +5717,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
ignore_writable = true;
/* Migrate to the requested node */
- if (!migrate_misplaced_folio(folio, vma, target_nid)) {
+ if (!migrate_misplaced_folio(folio, target_nid)) {
nid = target_nid;
flags |= TNF_MIGRATED;
task_numa_fault(last_cpupid, nid, nr_pages, flags);
@@ -6069,7 +6161,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
}
/*
- * By the time we get here, we already hold the mm semaphore
+ * By the time we get here, we already hold either the VMA lock or the
+ * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
@@ -6185,7 +6278,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r
/*
* Helper for page fault handling.
*
- * This is kind of equivalend to "mmap_read_lock()" followed
+ * This is kind of equivalent to "mmap_read_lock()" followed
* by "find_extend_vma()", except it's a lot more careful about
* the locking (and will drop the lock on failure).
*
@@ -6746,10 +6839,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
@@ -6961,7 +7052,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc)
void ptlock_free(struct ptdesc *ptdesc)
{
- kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
+ if (ptdesc->ptl)
+ kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif