Diffstat (limited to 'mm/memory.c'):
 mm/memory.c | 523 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 417 insertions(+), 106 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index fb7b8dc75167..2d8c265fc7d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -94,14 +94,6 @@
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
-#ifndef CONFIG_NUMA
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
-
-struct page *mem_map;
-EXPORT_SYMBOL(mem_map);
-#endif
-
static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);
@@ -121,14 +113,6 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
}
/*
- * A number of key systems in x86 including ioremap() rely on the assumption
- * that high_memory defines the upper bound on direct map memory, then end
- * of ZONE_NORMAL.
- */
-void *high_memory;
-EXPORT_SYMBOL(high_memory);
-
-/*
* Randomize the address space (stacks, mmaps, brk, etc.).
*
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
@@ -715,42 +699,53 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
}
#endif
+/**
+ * restore_exclusive_pte - Restore a device-exclusive entry
+ * @vma: VMA covering @address
+ * @folio: the mapped folio
+ * @page: the mapped folio page
+ * @address: the virtual address
+ * @ptep: pte pointer into the locked page table mapping the folio page
+ * @orig_pte: pte value at @ptep
+ *
+ * Restore a device-exclusive non-swap entry to an ordinary present pte.
+ *
+ * The folio and the page table must be locked, and MMU notifiers must have
+ * been called to invalidate any (exclusive) device mappings.
+ *
+ * Locking the folio makes sure that anybody who just converted the pte to
+ * a device-exclusive entry can map it into the device to make forward
+ * progress without others converting it back until the folio was unlocked.
+ *
+ * If the folio lock ever becomes an issue, we can stop relying on the folio
+ * lock; it might make some scenarios with heavy thrashing less likely to
+ * make forward progress, but these scenarios might not be valid use cases.
+ *
+ * Note that the folio lock does not protect against all cases of concurrent
+ * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
+ * must use MMU notifiers to sync against any concurrent changes.
+ */
static void restore_exclusive_pte(struct vm_area_struct *vma,
- struct page *page, unsigned long address,
- pte_t *ptep)
+ struct folio *folio, struct page *page, unsigned long address,
+ pte_t *ptep, pte_t orig_pte)
{
- struct folio *folio = page_folio(page);
- pte_t orig_pte;
pte_t pte;
- swp_entry_t entry;
- orig_pte = ptep_get(ptep);
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(orig_pte);
if (pte_swp_uffd_wp(orig_pte))
pte = pte_mkuffd_wp(pte);
- else if (is_writable_device_exclusive_entry(entry))
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
-
- VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) &&
- PageAnonExclusive(page)), folio);
-
- /*
- * No need to take a page reference as one was already
- * created when the swap entry was made.
- */
- if (folio_test_anon(folio))
- folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE);
- else
- /*
- * Currently device exclusive access only supports anonymous
- * memory so the entry shouldn't point to a filebacked page.
- */
- WARN_ON_ONCE(1);
+ if ((vma->vm_flags & VM_WRITE) &&
+ can_change_pte_writable(vma, address, pte)) {
+ if (folio_test_dirty(folio))
+ pte = pte_mkdirty(pte);
+ pte = pte_mkwrite(pte, vma);
+ }
set_pte_at(vma->vm_mm, address, ptep, pte);
/*
@@ -764,16 +759,15 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
* Tries to restore an exclusive pte if the page lock can be acquired without
* sleeping.
*/
-static int
-try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr)
+static int try_restore_exclusive_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
- swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte));
+ struct folio *folio = page_folio(page);
- if (trylock_page(page)) {
- restore_exclusive_pte(vma, page, addr, src_pte);
- unlock_page(page);
+ if (folio_trylock(folio)) {
+ restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
+ folio_unlock(folio);
return 0;
}
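The kernel-doc above spells out a locking protocol; a hedged sketch of the full caller-side sequence (mirroring remove_device_exclusive_entry() later in this patch, with illustrative local names) looks like this:

static void example_restore_device_exclusive(struct vm_area_struct *vma,
		struct folio *folio, struct page *page, unsigned long addr,
		pmd_t *pmd, pte_t orig_pte)
{
	struct mmu_notifier_range range;
	spinlock_t *ptl;
	pte_t *ptep;

	/* Folio lock first, then notify devices, then the PTE lock. */
	folio_lock(folio);
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				      addr, addr + PAGE_SIZE, NULL);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (ptep && pte_same(ptep_get(ptep), orig_pte))
		restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
	if (ptep)
		pte_unmap_unlock(ptep, ptl);

	mmu_notifier_invalidate_range_end(&range);
	folio_unlock(folio);
}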
@@ -853,7 +847,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
folio_get(folio);
rss[mm_counter(folio)]++;
/* Cannot fail as these pages cannot get pinned. */
- folio_try_dup_anon_rmap_pte(folio, page, src_vma);
+ folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);
/*
* We do not preserve soft-dirty information, because so
@@ -879,7 +873,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* (ie. COW) mappings.
*/
VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
- if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+ if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
@@ -1007,14 +1001,14 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
- nr, src_vma))) {
+ nr, dst_vma, src_vma))) {
folio_ref_sub(folio, nr);
return -EAGAIN;
}
rss[MM_ANONPAGES] += nr;
VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
} else {
- folio_dup_file_rmap_ptes(folio, page, nr);
+ folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
rss[mm_counter_file(folio)] += nr;
}
if (any_writable)
@@ -1032,7 +1026,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
+ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
/* Page may be pinned, we have to copy. */
folio_put(folio);
err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
@@ -1042,7 +1036,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
} else {
- folio_dup_file_rmap_pte(folio, page);
+ folio_dup_file_rmap_pte(folio, page, dst_vma);
rss[mm_counter_file(folio)]++;
}
@@ -1362,12 +1356,12 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
- unsigned long next;
unsigned long addr = src_vma->vm_start;
unsigned long end = src_vma->vm_end;
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
+ unsigned long next, pfn;
bool is_cow;
int ret;
@@ -1378,11 +1372,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- /*
- * We do not free on error cases below as remove_vma
- * gets called on error from higher level routine
- */
- ret = track_pfn_copy(src_vma);
+ ret = track_pfn_copy(dst_vma, src_vma, &pfn);
if (ret)
return ret;
}
@@ -1419,7 +1409,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
- untrack_pfn_clear(dst_vma);
ret = -ENOMEM;
break;
}
@@ -1429,6 +1418,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
+ if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+ untrack_pfn_copy(dst_vma, pfn);
return ret;
}
@@ -1622,8 +1613,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
*/
WARN_ON_ONCE(!vma_is_anonymous(vma));
rss[mm_counter(folio)]--;
- if (is_device_private_entry(entry))
- folio_remove_rmap_pte(folio, page, vma);
+ folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
} else if (!non_swap_entry(entry)) {
/* Genuine swap entries, hence a private anon pages */
@@ -2135,19 +2125,39 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
}
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
- unsigned long addr, struct page *page, pgprot_t prot)
+ unsigned long addr, struct page *page,
+ pgprot_t prot, bool mkwrite)
{
struct folio *folio = page_folio(page);
- pte_t pteval;
+ pte_t pteval = ptep_get(pte);
+
+ if (!pte_none(pteval)) {
+ if (!mkwrite)
+ return -EBUSY;
+
+ /* see insert_pfn(). */
+ if (pte_pfn(pteval) != page_to_pfn(page)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
+ return -EFAULT;
+ }
+ pteval = maybe_mkwrite(pteval, vma);
+ pteval = pte_mkyoung(pteval);
+ if (ptep_set_access_flags(vma, addr, pte, pteval, 1))
+ update_mmu_cache(vma, addr, pte);
+ return 0;
+ }
- if (!pte_none(ptep_get(pte)))
- return -EBUSY;
/* Ok, finally just insert the thing.. */
pteval = mk_pte(page, prot);
if (unlikely(is_zero_folio(folio))) {
pteval = pte_mkspecial(pteval);
} else {
folio_get(folio);
+ pteval = mk_pte(page, prot);
+ if (mkwrite) {
+ pteval = pte_mkyoung(pteval);
+ pteval = maybe_mkwrite(pte_mkdirty(pteval), vma);
+ }
inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
folio_add_file_rmap_pte(folio, page, vma);
}
@@ -2156,7 +2166,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
}
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, pgprot_t prot)
+ struct page *page, pgprot_t prot, bool mkwrite)
{
int retval;
pte_t *pte;
@@ -2169,7 +2179,8 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
+ mkwrite);
pte_unmap_unlock(pte, ptl);
out:
return retval;
@@ -2183,7 +2194,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
err = validate_page_before_insert(vma, page);
if (err)
return err;
- return insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -2319,7 +2330,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
BUG_ON(vma->vm_flags & VM_PFNMAP);
vm_flags_set(vma, VM_MIXEDMAP);
}
- return insert_page(vma, addr, page, vma->vm_page_prot);
+ return insert_page(vma, addr, page, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vm_insert_page);
@@ -2599,7 +2610,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- err = insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot, mkwrite);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
@@ -2612,6 +2623,26 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
+ bool write)
+{
+ pgprot_t pgprot = vmf->vma->vm_page_prot;
+ unsigned long addr = vmf->address;
+ int err;
+
+ if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ err = insert_page(vmf->vma, addr, page, pgprot, write);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
+
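A hypothetical user of the new export (not part of this patch): a driver fault handler that backs the VMA with pre-allocated pages and upgrades the PTE to writable on write faults. "struct my_buffer" and its fields are made up for illustration.

static vm_fault_t my_buffer_fault(struct vm_fault *vmf)
{
	struct my_buffer *buf = vmf->vma->vm_private_data;
	struct page *page = buf->pages[vmf->pgoff];

	/* Insert the page; make it writable only for write faults. */
	return vmf_insert_page_mkwrite(vmf, page,
				       vmf->flags & FAULT_FLAG_WRITE);
}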
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
@@ -3676,19 +3707,86 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
return ret;
}
-static bool wp_can_reuse_anon_folio(struct folio *folio,
- struct vm_area_struct *vma)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
{
+ bool exclusive = false;
+
+ /* Let's just free up a large folio if only a single page is mapped. */
+ if (folio_large_mapcount(folio) <= 1)
+ return false;
+
/*
- * We could currently only reuse a subpage of a large folio if no
- * other subpages of the large folios are still mapped. However,
- * let's just consistently not reuse subpages even if we could
- * reuse in that scenario, and give back a large folio a bit
- * sooner.
+ * The assumption for anonymous folios is that each page can only get
+ * mapped once into each MM. The only exception are KSM folios, which
+ * are always small.
+ *
+ * Each taken mapcount must be paired with exactly one taken reference,
+ * whereby the refcount must be incremented before the mapcount when
+ * mapping a page, and the refcount must be decremented after the
+ * mapcount when unmapping a page.
+ *
+ * If all folio references are from mappings, and all mappings are in
+ * the page tables of this MM, then this folio is exclusive to this MM.
*/
- if (folio_test_large(folio))
+ if (folio_test_large_maybe_mapped_shared(folio))
+ return false;
+
+ VM_WARN_ON_ONCE(folio_test_ksm(folio));
+ VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio));
+ VM_WARN_ON_ONCE(folio_entire_mapcount(folio));
+
+ if (unlikely(folio_test_swapcache(folio))) {
+ /*
+ * Note: freeing up the swapcache will fail if some PTEs are
+ * still swap entries.
+ */
+ if (!folio_trylock(folio))
+ return false;
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+
+ if (folio_large_mapcount(folio) != folio_ref_count(folio))
return false;
+ /* Stabilize the mapcount vs. refcount and recheck. */
+ folio_lock_large_mapcount(folio);
+ VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio));
+
+ if (folio_test_large_maybe_mapped_shared(folio))
+ goto unlock;
+ if (folio_large_mapcount(folio) != folio_ref_count(folio))
+ goto unlock;
+
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
+ folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
+
+ /*
+ * Do we need the folio lock? Likely not. If there would have been
+ * references from page migration/swapout, we would have detected
+ * an additional folio reference and never ended up here.
+ */
+ exclusive = true;
+unlock:
+ folio_unlock_large_mapcount(folio);
+ return exclusive;
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
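Illustrative only (not in this patch): stripped of the swapcache handling and the locked re-check, the exclusivity test above reduces to the comparison below. For example, a 16-page mTHP mapped once by this MM has a large mapcount and refcount of 16 and may be reused; a single extra reference (say, a GUP pin) raises the refcount to 17 and forces a copy instead.

static bool anon_folio_exclusive_fastpath(struct folio *folio)
{
	/* All references come from PTE mappings in a single MM. */
	return !folio_test_large_maybe_mapped_shared(folio) &&
	       folio_large_mapcount(folio) == folio_ref_count(folio);
}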
+static bool wp_can_reuse_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
+ return __wp_can_reuse_large_anon_folio(folio, vma);
+
/*
* We have to verify under folio lock: these early checks are
* just an optimization to avoid locking the folio and freeing
@@ -3797,13 +3895,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
+ * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if (!vmf->page)
+ if (!vmf->page || is_fsdax_page(vmf->page)) {
+ vmf->page = NULL;
return wp_pfn_shared(vmf);
+ }
return wp_page_shared(vmf, folio);
}
@@ -3993,7 +4093,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
folio_put(folio);
return ret;
}
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
mmu_notifier_invalidate_range_start(&range);
@@ -4001,7 +4101,8 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
- restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
+ restore_exclusive_pte(vma, folio, vmf->page, vmf->address,
+ vmf->pte, vmf->orig_pte);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4349,10 +4450,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Get a page reference while we know the page can't be
* freed.
*/
- get_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
- put_page(vmf->page);
+ if (trylock_page(vmf->page)) {
+ struct dev_pagemap *pgmap;
+
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ pgmap = page_pgmap(vmf->page);
+ ret = pgmap->ops->migrate_to_ram(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_pte_marker_entry(entry)) {
@@ -4406,7 +4515,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
need_clear_cache = true;
- mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+ memcg1_swapin(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
@@ -5575,7 +5684,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
* Flag if the folio is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
+ if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
*flags |= TNF_SHARED;
/*
* For memory tiering mode, cpupid of slow memory page is used
@@ -6346,6 +6455,88 @@ fail:
#endif
#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+ unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+ /* Additional refcnt if the vma is attached. */
+ if (!detaching)
+ tgt_refcnt++;
+
+ /*
+ * If vma is detached then only vma_mark_attached() can raise the
+ * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ */
+ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ return false;
+
+ rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+ TASK_UNINTERRUPTIBLE);
+ lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+ return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+ *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+{
+ bool locked;
+
+ /*
+ * __vma_enter_locked() returns false immediately if the vma is not
+ * attached, otherwise it waits until refcnt is indicating that vma
+ * is attached with no readers.
+ */
+ locked = __vma_enter_locked(vma, false);
+
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+ if (locked) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(detached); /* vma should remain attached */
+ }
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
+
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * We are the only writer, so no need to use vma_refcount_put().
+ * The condition below is unlikely because the vma has been already
+ * write-locked and readers can increment vm_refcnt only temporarily
+ * before they check vm_lock_seq, realize the vma is locked and drop
+ * back the vm_refcnt. That is a narrow window for observing a raised
+ * vm_refcnt.
+ */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /* Wait until vma is detached with no readers. */
+ if (__vma_enter_locked(vma, true)) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(!detached);
+ }
+ }
+}
+
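Illustrative writer-side pattern (assumes mmap_write_lock(mm) is held, as for any VMA modification): vma_start_write() lands in __vma_start_write() above and waits until concurrent per-VMA-lock readers have drained. The field update shown is hypothetical.

static void example_update_vma(struct vm_area_struct *vma, unsigned long new_end)
{
	vma_start_write(vma);		/* drain per-VMA-lock readers */
	vma_assert_write_locked(vma);
	vma->vm_end = new_end;		/* hypothetical modification */
}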
/*
* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
* stable and not isolated. If the VMA is not found or is being modified the
@@ -6363,15 +6554,17 @@ retry:
if (!vma)
goto inval;
- if (!vma_start_read(vma))
- goto inval;
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /* Check if the VMA got isolated after we found it */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
- /* Check if the VMA got isolated after we found it */
- if (vma->detached) {
- vma_end_read(vma);
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- goto retry;
+ /* Failed to lock the VMA */
+ goto inval;
}
/*
* At this point, we have a stable reference to a VMA: The VMA is
@@ -6380,8 +6573,9 @@ retry:
* fields are accessible for RCU readers.
*/
- /* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ /* Check if the vma we locked is the right one. */
+ if (unlikely(vma->vm_mm != mm ||
+ address < vma->vm_start || address >= vma->vm_end))
goto inval_end_read;
rcu_read_unlock();
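A hedged sketch of the reader side, as arch page-fault handlers use it: look up and read-lock the VMA without taking mmap_lock, and let the caller fall back to the mmap_read_lock() path when that fails.

static vm_fault_t example_lockless_fault(struct mm_struct *mm,
		unsigned long address, unsigned int flags,
		struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		return VM_FAULT_RETRY;	/* caller falls back to mmap_lock */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	/* On retry/completion the VMA lock was already dropped for us. */
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
	return fault;
}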
@@ -6476,6 +6670,7 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
args->lock = lock;
args->ptep = ptep;
args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+ args->addr_mask = addr_mask;
args->pgprot = pgprot;
args->writable = writable;
args->special = special;
@@ -6635,7 +6830,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
- unsigned long prot = 0;
+ pgprot_t prot = __pgprot(0);
void __iomem *maddr;
int offset = offset_in_page(addr);
int ret = -EINVAL;
@@ -6645,7 +6840,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
retry:
if (follow_pfnmap_start(&args))
return -EINVAL;
- prot = pgprot_val(args.pgprot);
+ prot = args.pgprot;
phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
writable = args.writable;
follow_pfnmap_end(&args);
@@ -6660,7 +6855,7 @@ retry:
if (follow_pfnmap_start(&args))
goto out_unmap;
- if ((prot != pgprot_val(args.pgprot)) ||
+ if ((pgprot_val(prot) != pgprot_val(args.pgprot)) ||
(phys_addr != (args.pfn << PAGE_SHIFT)) ||
(writable != args.writable)) {
follow_pfnmap_end(&args);
@@ -6802,6 +6997,124 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ void *old_buf = buf;
+ int err = 0;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return -EFAULT;
+
+ addr = untagged_addr_remote(mm, addr);
+
+ /* Avoid triggering the temporary warning in __get_user_pages */
+ if (!vma_lookup(mm, addr)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ while (len) {
+ int bytes, offset, retval;
+ void *maddr;
+ struct page *page;
+ struct vm_area_struct *vma = NULL;
+
+ page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
+ if (IS_ERR(page)) {
+ /*
+ * Treat as a total failure for now until we decide how
+ * to handle the CONFIG_HAVE_IOREMAP_PROT case and
+ * stack expansion.
+ */
+ *(char *)buf = '\0';
+ err = -EFAULT;
+ goto out;
+ }
+
+ bytes = len;
+ offset = addr & (PAGE_SIZE - 1);
+ if (bytes > PAGE_SIZE - offset)
+ bytes = PAGE_SIZE - offset;
+
+ maddr = kmap_local_page(page);
+ retval = strscpy(buf, maddr + offset, bytes);
+ if (retval >= 0) {
+ /* Found the end of the string */
+ buf += retval;
+ unmap_and_put_page(page, maddr);
+ break;
+ }
+
+ buf += bytes - 1;
+ /*
+ * Because strscpy always NUL terminates we need to
+ * copy the last byte in the page if we are going to
+ * load more pages
+ */
+ if (bytes != len) {
+ addr += bytes - 1;
+ copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1);
+ buf += 1;
+ addr += 1;
+ }
+ len -= bytes;
+
+ unmap_and_put_page(page, maddr);
+ }
+
+out:
+ mmap_read_unlock(mm);
+ if (err)
+ return err;
+ return buf - old_buf;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination);
+ * not including the trailing NUL. Always guaranteed to leave NUL-terminated
+ * buffer. On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
+
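A hypothetical caller sketch (for instance from a BPF helper): copy a NUL-terminated string out of another task's address space into a bounded kernel buffer, relying on the return convention documented above.

static int example_read_task_string(struct task_struct *tsk, unsigned long uaddr)
{
	char name[64];
	int len;

	len = copy_remote_vm_str(tsk, uaddr, name, sizeof(name), 0);
	if (len < 0)
		return len;	/* -EFAULT; name[] still holds "" */

	pr_info("remote string (%d bytes): %s\n", len, name);
	return 0;
}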
/*
* Print the name of a VMA.
*/
@@ -6834,10 +7147,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif