Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  1019
1 file changed, 694 insertions(+), 325 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3d3ebdc002d5..2a47682d1ab7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1309,8 +1309,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
struct folio *zero_folio)
{
pmd_t entry;
- if (!pmd_none(*pmd))
- return;
entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
@@ -1375,20 +1373,20 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return __do_huge_pmd_anonymous_page(vmf);
}
-static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
pgtable_t pgtable)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
- spinlock_t *ptl;
- ptl = pmd_lock(mm, pmd);
+ lockdep_assert_held(pmd_lockptr(mm, pmd));
+
if (!pmd_none(*pmd)) {
if (write) {
if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- goto out_unlock;
+ return -EEXIST;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1396,7 +1394,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pmd(vma, addr, pmd);
}
- goto out_unlock;
+ return -EEXIST;
}
entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
@@ -1412,16 +1410,11 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
- pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
-
-out_unlock:
- spin_unlock(ptl);
- if (pgtable)
- pte_free(mm, pgtable);
+ return 0;
}
/**
@@ -1440,6 +1433,8 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
+ int error;
/*
* If we had pmd_special, we could avoid all these restrictions,
@@ -1462,12 +1457,56 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
}
track_pfn_insert(vma, &pgprot, pfn);
+ ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write,
+ pgtable);
+ spin_unlock(ptl);
+ if (error && pgtable)
+ pte_free(vma->vm_mm, pgtable);
- insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
+ bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pgtable_t pgtable = NULL;
+ int error;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
+ ptl = pmd_lock(mm, vmf->pmd);
+ if (pmd_none(*vmf->pmd)) {
+ folio_get(folio);
+ folio_add_file_rmap_pmd(folio, &folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
+ }
+ error = insert_pfn_pmd(vma, addr, vmf->pmd,
+ pfn_to_pfn_t(folio_pfn(folio)), vma->vm_page_prot,
+ write, pgtable);
+ spin_unlock(ptl);
+ if (error && pgtable)
+ pte_free(mm, pgtable);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
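
A minimal caller sketch (illustrative, not part of this patch): how a driver ->huge_fault() handler might use the new export. my_get_pmd_folio() is a hypothetical driver-specific folio lookup.

static vm_fault_t my_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	struct folio *folio;

	if (order != PMD_ORDER)
		return VM_FAULT_FALLBACK;

	/* hypothetical helper: find the PMD-sized folio backing the fault */
	folio = my_get_pmd_folio(vmf);
	if (!folio)
		return VM_FAULT_FALLBACK;

	/*
	 * vmf_insert_folio_pmd() takes the PMD lock itself and handles the
	 * already-mapped case, including upgrading the entry to writeable.
	 */
	return vmf_insert_folio_pmd(vmf, folio, vmf->flags & FAULT_FLAG_WRITE);
}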
+
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
@@ -1482,19 +1521,17 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
pgprot_t prot = vma->vm_page_prot;
pud_t entry;
- spinlock_t *ptl;
- ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
if (write) {
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
- goto out_unlock;
+ return;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- goto out_unlock;
+ return;
}
entry = pud_mkhuge(pfn_t_pud(pfn, prot));
@@ -1508,9 +1545,6 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
-
-out_unlock:
- spin_unlock(ptl);
}
/**
@@ -1528,6 +1562,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+ spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1545,10 +1580,57 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
track_pfn_insert(vma, &pgprot, pfn);
+ ptl = pud_lock(vma->vm_mm, vmf->pud);
insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ spin_unlock(ptl);
+
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+
+/**
+ * vmf_insert_folio_pud - insert a pud-sized folio mapped by a pud entry
+ * @vmf: Structure describing the fault
+ * @folio: folio to insert
+ * @write: whether it's a write fault
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
+ bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PUD_MASK;
+ pud_t *pud = vmf->pud;
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
+
+ /*
+ * If there is already an entry present we assume the folio is
+ * already mapped, hence no need to take another reference. We
+ * still call insert_pfn_pud() though in case the mapping needs
+ * upgrading to writeable.
+ */
+ if (pud_none(*vmf->pud)) {
+ folio_get(folio);
+ folio_add_file_rmap_pud(folio, &folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
+ }
+ insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
+ write);
+ spin_unlock(ptl);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1698,7 +1780,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
src_folio = page_folio(src_page);
folio_get(src_folio);
- if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
+ if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
/* Page maybe pinned: split and retry the fault on PTEs. */
folio_put(src_folio);
pte_free(dst_mm, pgtable);
@@ -2071,7 +2153,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this folio, we couldn't discard
* the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto out;
if (!folio_trylock(folio))
@@ -2141,12 +2223,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb->fullmm);
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else if (is_huge_zero_pmd(orig_pmd)) {
- zap_deposited_table(tlb->mm, pmd);
+ if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
struct folio *folio = NULL;
@@ -2646,12 +2729,24 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
arch_check_zapped_pud(vma, orig_pud);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
- /* No support for anonymous PUD pages yet */
- BUG();
+ struct page *page = NULL;
+ struct folio *folio;
+
+ /* No support for anonymous PUD pages or migration yet */
+ VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
+ !pud_present(orig_pud));
+
+ page = pud_page(orig_pud);
+ folio = page_folio(page);
+ folio_remove_rmap_pud(folio, page, vma);
+ add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+
+ spin_unlock(ptl);
+ tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
}
return 1;
}
@@ -2659,6 +2754,10 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
unsigned long haddr)
{
+ struct folio *folio;
+ struct page *page;
+ pud_t old_pud;
+
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
@@ -2666,7 +2765,22 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
count_vm_event(THP_SPLIT_PUD);
- pudp_huge_clear_flush(vma, haddr, pud);
+ old_pud = pudp_huge_clear_flush(vma, haddr, pud);
+
+ if (!vma_is_dax(vma))
+ return;
+
+ page = pud_page(old_pud);
+ folio = page_folio(page);
+
+ if (!folio_test_dirty(folio) && pud_dirty(old_pud))
+ folio_mark_dirty(folio);
+ if (!folio_test_referenced(folio) && pud_young(old_pud))
+ folio_set_referenced(folio);
+ folio_remove_rmap_pud(folio, page, vma);
+ folio_put(folio);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio),
+ -HPAGE_PUD_NR);
}
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
@@ -2766,13 +2880,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (vma_is_special_huge(vma))
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma))
return;
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
folio = pfn_swap_entry_folio(entry);
+ } else if (is_huge_zero_pmd(old_pmd)) {
+ return;
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
@@ -3017,9 +3133,9 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- long adjust_next)
+ unsigned long start,
+ unsigned long end,
+ struct vm_area_struct *next)
{
/* Check if we need to split start first. */
split_huge_pmd_if_needed(vma, start);
@@ -3027,16 +3143,9 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
/* Check if we need to split end next. */
split_huge_pmd_if_needed(vma, end);
- /*
- * If we're also updating the next vma vm_start,
- * check if we need to split it.
- */
- if (adjust_next > 0) {
- struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
- unsigned long nstart = next->vm_start;
- nstart += adjust_next;
- split_huge_pmd_if_needed(next, nstart);
- }
+ /* If we're incrementing next->vm_start, we might need to split it. */
+ if (next)
+ split_huge_pmd_if_needed(next, end);
}
static void unmap_folio(struct folio *folio)
@@ -3070,8 +3179,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
int ref_count, map_count;
pmd_t orig_pmd = *pmdp;
- if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
+ if (pmd_dirty(orig_pmd))
+ folio_set_dirty(folio);
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ folio_set_swapbacked(folio);
return false;
+ }
orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
@@ -3098,8 +3211,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
*
* The only folio refs must be one from isolation plus the rmap(s).
*/
- if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
- ref_count != map_count + 1) {
+ if (pmd_dirty(orig_pmd))
+ folio_set_dirty(folio);
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ folio_set_swapbacked(folio);
+ set_pmd_at(mm, addr, pmdp, orig_pmd);
+ return false;
+ }
+
+ if (ref_count != map_count + 1) {
set_pmd_at(mm, addr, pmdp, orig_pmd);
return false;
}
@@ -3119,12 +3239,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
{
VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+ VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
- if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
- return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
-
- return false;
+ return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
}
static void remap_page(struct folio *folio, unsigned long nr, int flags)
@@ -3143,225 +3262,378 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags)
}
}
-static void lru_add_page_tail(struct folio *folio, struct page *tail,
+static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
struct lruvec *lruvec, struct list_head *list)
{
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- VM_BUG_ON_FOLIO(PageLRU(tail), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
- get_page(tail);
- list_add_tail(&tail->lru, list);
+ folio_get(new_folio);
+ list_add_tail(&new_folio->lru, list);
} else {
/* head is still on lru (and we have it frozen) */
VM_WARN_ON(!folio_test_lru(folio));
if (folio_test_unevictable(folio))
- tail->mlock_count = 0;
+ new_folio->mlock_count = 0;
else
- list_add_tail(&tail->lru, &folio->lru);
- SetPageLRU(tail);
+ list_add_tail(&new_folio->lru, &folio->lru);
+ folio_set_lru(new_folio);
}
}
-static void __split_huge_page_tail(struct folio *folio, int tail,
- struct lruvec *lruvec, struct list_head *list,
- unsigned int new_order)
+/* Racy check whether the huge page can be split */
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
- struct page *head = &folio->page;
- struct page *page_tail = head + tail;
- /*
- * Careful: new_folio is not a "real" folio before we cleared PageTail.
- * Don't pass it around before clear_compound_head().
- */
- struct folio *new_folio = (struct folio *)page_tail;
+ int extra_pins;
+
+ /* Additional pins from page cache */
+ if (folio_test_anon(folio))
+ extra_pins = folio_test_swapcache(folio) ?
+ folio_nr_pages(folio) : 0;
+ else
+ extra_pins = folio_nr_pages(folio);
+ if (pextra_pins)
+ *pextra_pins = extra_pins;
+ return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
+ caller_pins;
+}
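
A worked example of the pin accounting above (illustrative numbers in user-space C, not from the patch): for an order-9 anonymous folio that is also in the swap cache, each of the 512 pages holds one swap-cache reference, so the folio is splittable only when no further references exist.

#include <assert.h>

int main(void)
{
	int mapcount = 1;	/* folio mapped exactly once */
	int extra_pins = 512;	/* one swap-cache reference per page */
	int caller_pins = 1;	/* reference held by the splitting caller */
	int refcount = 1 + extra_pins + caller_pins;	/* no GUP pins */

	/* mirrors can_split_folio()'s check */
	assert(mapcount == refcount - extra_pins - caller_pins);
	return 0;
}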
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+/*
+ * It splits @folio into folios of order @new_order and copies the @folio
+ * metadata to all the resulting folios.
+ */
+static void __split_folio_to_order(struct folio *folio, int old_order,
+ int new_order)
+{
+ long new_nr_pages = 1 << new_order;
+ long nr_pages = 1 << old_order;
+ long i;
/*
- * Clone page flags before unfreezing refcount.
- *
- * After successful get_page_unless_zero() might follow flags change,
- * for example lock_page() which set PG_waiters.
- *
- * Note that for mapped sub-pages of an anonymous THP,
- * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
- * the migration entry instead from where remap_page() will restore it.
- * We can still have PG_anon_exclusive set on effectively unmapped and
- * unreferenced sub-pages of an anonymous THP: we can simply drop
- * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
+ * Skip the first new_nr_pages, since the new folio built from them
+ * keeps all the flags of the original folio.
*/
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (head->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_swapcache) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_workingset) |
- (1L << PG_locked) |
- (1L << PG_unevictable) |
+ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
+ struct page *new_head = &folio->page + i;
+
+ /*
+ * Careful: new_folio is not a "real" folio before we cleared PageTail.
+ * Don't pass it around before clear_compound_head().
+ */
+ struct folio *new_folio = (struct folio *)new_head;
+
+ VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);
+
+ /*
+ * Clone page flags before unfreezing refcount.
+ *
+ * After successful get_page_unless_zero() might follow flags change,
+ * for example lock_page() which set PG_waiters.
+ *
+ * Note that for mapped sub-pages of an anonymous THP,
+ * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
+ * the migration entry instead from where remap_page() will restore it.
+ * We can still have PG_anon_exclusive set on effectively unmapped and
+ * unreferenced sub-pages of an anonymous THP: we can simply drop
+ * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
+ */
+ new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags |= (folio->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_swapcache) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_workingset) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
- (1L << PG_arch_2) |
+ (1L << PG_arch_2) |
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
- (1L << PG_arch_3) |
+ (1L << PG_arch_3) |
#endif
- (1L << PG_dirty) |
- LRU_GEN_MASK | LRU_REFS_MASK));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
- /* ->mapping in first and second tail page is replaced by other uses */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- page_tail);
- new_folio->mapping = folio->mapping;
- new_folio->index = folio->index + tail;
+ new_folio->mapping = folio->mapping;
+ new_folio->index = folio->index + i;
- /*
- * page->private should not be set in tail pages. Fix up and warn once
- * if private is unexpectedly set.
- */
- if (unlikely(page_tail->private)) {
- VM_WARN_ON_ONCE_PAGE(true, page_tail);
- page_tail->private = 0;
- }
- if (folio_test_swapcache(folio))
- new_folio->swap.val = folio->swap.val + tail;
+ /*
+ * page->private should not be set in tail pages. Fix up and warn once
+ * if private is unexpectedly set.
+ */
+ if (unlikely(new_folio->private)) {
+ VM_WARN_ON_ONCE_PAGE(true, new_head);
+ new_folio->private = NULL;
+ }
- /* Page flags must be visible before we make the page non-compound. */
- smp_wmb();
+ if (folio_test_swapcache(folio))
+ new_folio->swap.val = folio->swap.val + i;
- /*
- * Clear PageTail before unfreezing page refcount.
- *
- * After successful get_page_unless_zero() might follow put_page()
- * which needs correct compound_head().
- */
- clear_compound_head(page_tail);
- if (new_order) {
- prep_compound_page(page_tail, new_order);
- folio_set_large_rmappable(new_folio);
- }
+ /* Page flags must be visible before we make the page non-compound. */
+ smp_wmb();
- /* Finally unfreeze refcount. Additional reference from page cache. */
- page_ref_unfreeze(page_tail,
- 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
- folio_nr_pages(new_folio) : 0));
+ /*
+ * Clear PageTail before unfreezing page refcount.
+ *
+ * After successful get_page_unless_zero() might follow put_page()
+ * which needs correct compound_head().
+ */
+ clear_compound_head(new_head);
+ if (new_order) {
+ prep_compound_page(new_head, new_order);
+ folio_set_large_rmappable(new_folio);
+ }
- if (folio_test_young(folio))
- folio_set_young(new_folio);
- if (folio_test_idle(folio))
- folio_set_idle(new_folio);
+ if (folio_test_young(folio))
+ folio_set_young(new_folio);
+ if (folio_test_idle(folio))
+ folio_set_idle(new_folio);
+#ifdef CONFIG_MEMCG
+ new_folio->memcg_data = folio->memcg_data;
+#endif
- folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
+ folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
+ }
- /*
- * always add to the tail because some iterators expect new
- * pages to show after the currently processed elements - e.g.
- * migrate_pages
- */
- lru_add_page_tail(folio, page_tail, lruvec, list);
+ if (new_order)
+ folio_set_order(folio, new_order);
+ else
+ ClearPageCompound(&folio->page);
}
-static void __split_huge_page(struct page *page, struct list_head *list,
- pgoff_t end, unsigned int new_order)
+/*
+ * __split_unmapped_folio() splits an unmapped @folio to lower-order folios in
+ * two ways.
+ * @folio: the to-be-split folio
+ * @new_order: the smallest order of the after-split folios (since buddy
+ * allocator like split generates folios with orders from @folio's
+ * order - 1 down to @new_order).
+ * @split_at: in buddy allocator like split, the folio containing @split_at
+ * will be split until its order becomes @new_order.
+ * @lock_at: the folio containing @lock_at is left locked for the caller.
+ * @list: the after-split folios will be added to @list if it is not NULL,
+ * otherwise to LRU lists.
+ * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory.
+ * @xas: xa_state pointing to folio->mapping->i_pages, locked by the caller
+ * @mapping: @folio->mapping
+ * @uniform_split: if the split is uniform or not (buddy allocator like split)
+ *
+ * 1. uniform split: the given @folio is split into multiple @new_order small
+ * folios, where all small folios have the same order. This is done when
+ * uniform_split is true.
+ * 2. buddy allocator like (non-uniform) split: the given @folio is split into
+ * half, and the half containing @split_at is split into half again until the
+ * containing folio's order becomes @new_order. This is done when
+ * uniform_split is false.
+ *
+ * The high-level flow of these two methods is:
+ * 1. uniform split: a single __split_folio_to_order() is called to split the
+ * @folio into @new_order folios, then we traverse all the resulting folios
+ * one by one in PFN ascending order and perform stats, unfreeze, adding to
+ * list, and file mapping index operations.
+ * 2. non-uniform split: in general, folio_order - @new_order calls to
+ * __split_folio_to_order() are made in a for loop to split the @folio
+ * to one lower order at a time. The resulting small folios are processed
+ * like in the traversal of 1, except the one containing @split_at, which
+ * is split in the next iteration of the loop.
+ *
+ * After splitting, the caller's folio reference will be transferred to the
+ * folio containing @split_at. The other folios may be freed if they are not
+ * mapped.
+ *
+ * In terms of locking, after splitting,
+ * 1. uniform split leaves the folio containing @lock_at locked;
+ * 2. buddy allocator like (non-uniform) split leaves @folio locked.
+ *
+ * For !uniform_split, when -ENOMEM is returned, the original folio might be
+ * split. The caller needs to check the input folio.
+ */
+static int __split_unmapped_folio(struct folio *folio, int new_order,
+ struct page *split_at, struct page *lock_at,
+ struct list_head *list, pgoff_t end,
+ struct xa_state *xas, struct address_space *mapping,
+ bool uniform_split)
{
- struct folio *folio = page_folio(page);
- struct page *head = &folio->page;
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
- unsigned long offset = 0;
- int i, nr_dropped = 0;
- unsigned int new_nr = 1 << new_order;
+ struct folio *origin_folio = folio;
+ struct folio *next_folio = folio_next(folio);
+ struct folio *new_folio;
+ struct folio *next;
int order = folio_order(folio);
- unsigned int nr = 1 << order;
+ int split_order;
+ int start_order = uniform_split ? new_order : order - 1;
+ int nr_dropped = 0;
+ int ret = 0;
+ bool stop_split = false;
+
+ if (folio_test_swapcache(folio)) {
+ VM_BUG_ON(mapping);
- /* complete memcg works before add pages to LRU */
- split_page_memcg(head, order, new_order);
+ /* a swapcache folio can only be uniformly split to order-0 */
+ if (!uniform_split || new_order != 0)
+ return -EINVAL;
- if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
- offset = swap_cache_index(folio->swap);
swap_cache = swap_address_space(folio->swap);
xa_lock(&swap_cache->i_pages);
}
+ if (folio_test_anon(folio))
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
lruvec = folio_lruvec_lock(folio);
folio_clear_has_hwpoisoned(folio);
- for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
- struct folio *tail;
- __split_huge_page_tail(folio, i, lruvec, list, new_order);
- tail = page_folio(head + i);
- /* Some pages can be beyond EOF: drop them from page cache */
- if (tail->index >= end) {
- if (shmem_mapping(folio->mapping))
- nr_dropped++;
- else if (folio_test_clear_dirty(tail))
- folio_account_cleaned(tail,
- inode_to_wb(folio->mapping->host));
- __filemap_remove_folio(tail, NULL);
- folio_put(tail);
- } else if (!folio_test_anon(folio)) {
- __xa_store(&folio->mapping->i_pages, tail->index,
- tail, 0);
- } else if (swap_cache) {
- __xa_store(&swap_cache->i_pages, offset + i,
- tail, 0);
+ /*
+ * split to new_order one order at a time. For uniform split,
+ * folio is split to new_order directly.
+ */
+ for (split_order = start_order;
+ split_order >= new_order && !stop_split;
+ split_order--) {
+ int old_order = folio_order(folio);
+ struct folio *release;
+ struct folio *end_folio = folio_next(folio);
+
+ /* order-1 anonymous folios are not supported */
+ if (folio_test_anon(folio) && split_order == 1)
+ continue;
+ if (uniform_split && split_order != new_order)
+ continue;
+
+ if (mapping) {
+ /*
+ * uniform split has xas_split_alloc() called before
+ * irq is disabled to allocate enough memory, whereas
+ * non-uniform split can handle ENOMEM.
+ */
+ if (uniform_split)
+ xas_split(xas, folio, old_order);
+ else {
+ xas_set_order(xas, folio->index, split_order);
+ xas_try_split(xas, folio, old_order);
+ if (xas_error(xas)) {
+ ret = xas_error(xas);
+ stop_split = true;
+ goto after_split;
+ }
+ }
}
- }
- if (!new_order)
- ClearPageCompound(head);
- else {
- struct folio *new_folio = (struct folio *)head;
+ folio_split_memcg_refs(folio, old_order, split_order);
+ split_page_owner(&folio->page, old_order, split_order);
+ pgalloc_tag_split(folio, old_order, split_order);
- folio_set_order(new_folio, new_order);
- }
- unlock_page_lruvec(lruvec);
- /* Caller disabled irqs, so they are still disabled here */
+ __split_folio_to_order(folio, old_order, split_order);
- split_page_owner(head, order, new_order);
- pgalloc_tag_split(folio, order, new_order);
+after_split:
+ /*
+ * Iterate through after-split folios and perform related
+ * operations. But in buddy allocator like split, the folio
+ * containing the specified page is skipped until its order
+ * is new_order, since the folio will be worked on in next
+ * iteration.
+ */
+ for (release = folio; release != end_folio; release = next) {
+ next = folio_next(release);
+ /*
+ * for buddy allocator like split, the folio containing
+ * @split_at will be split next and should not be released
+ * until the folio's order is new_order, or stop_split was
+ * set to true by the above xas_try_split() failure.
+ */
+ if (release == page_folio(split_at)) {
+ folio = release;
+ if (split_order != new_order && !stop_split)
+ continue;
+ }
+ if (folio_test_anon(release)) {
+ mod_mthp_stat(folio_order(release),
+ MTHP_STAT_NR_ANON, 1);
+ }
- /* See comment in __split_huge_page_tail() */
- if (folio_test_anon(folio)) {
- /* Additional pin to swap cache */
- if (folio_test_swapcache(folio)) {
- folio_ref_add(folio, 1 + new_nr);
- xa_unlock(&swap_cache->i_pages);
- } else {
- folio_ref_inc(folio);
+ /*
+ * origin_folio should be kept frozen until page cache
+ * entries are updated with all the other after-split
+ * folios to prevent others seeing stale page cache
+ * entries.
+ */
+ if (release == origin_folio)
+ continue;
+
+ folio_ref_unfreeze(release, 1 +
+ ((mapping || swap_cache) ?
+ folio_nr_pages(release) : 0));
+
+ lru_add_split_folio(origin_folio, release, lruvec,
+ list);
+
+ /* Some pages can be beyond EOF: drop them from cache */
+ if (release->index >= end) {
+ if (shmem_mapping(mapping))
+ nr_dropped += folio_nr_pages(release);
+ else if (folio_test_clear_dirty(release))
+ folio_account_cleaned(release,
+ inode_to_wb(mapping->host));
+ __filemap_remove_folio(release, NULL);
+ folio_put_refs(release, folio_nr_pages(release));
+ } else if (mapping) {
+ __xa_store(&mapping->i_pages,
+ release->index, release, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages,
+ swap_cache_index(release->swap),
+ release, 0);
+ }
}
- } else {
- /* Additional pin to page cache */
- folio_ref_add(folio, 1 + new_nr);
- xa_unlock(&folio->mapping->i_pages);
}
+
+ /*
+ * Unfreeze origin_folio only after all page cache entries, which used
+ * to point to it, have been updated with new folios. Otherwise,
+ * a parallel folio_try_get() can grab origin_folio and its caller can
+ * see stale page cache entries.
+ */
+ folio_ref_unfreeze(origin_folio, 1 +
+ ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0));
+
+ unlock_page_lruvec(lruvec);
+
+ if (swap_cache)
+ xa_unlock(&swap_cache->i_pages);
+ if (mapping)
+ xa_unlock(&mapping->i_pages);
+
+ /* Caller disabled irqs, so they are still disabled here */
local_irq_enable();
if (nr_dropped)
- shmem_uncharge(folio->mapping->host, nr_dropped);
- remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
+ shmem_uncharge(mapping->host, nr_dropped);
+
+ remap_page(origin_folio, 1 << order,
+ folio_test_anon(origin_folio) ?
+ RMP_USE_SHARED_ZEROPAGE : 0);
/*
- * set page to its compound_head when split to non order-0 pages, so
- * we can skip unlocking it below, since PG_locked is transferred to
- * the compound_head of the page and the caller will unlock it.
+ * At this point, folio should contain the specified page.
+ * For uniform split, it is left for the caller to unlock.
+ * For buddy allocator like split, the first after-split folio is left
+ * for the caller to unlock.
*/
- if (new_order)
- page = compound_head(page);
-
- for (i = 0; i < nr; i += new_nr) {
- struct page *subpage = head + i;
- struct folio *new_folio = page_folio(subpage);
- if (subpage == page)
+ for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) {
+ next = folio_next(new_folio);
+ if (new_folio == page_folio(lock_at))
continue;
- folio_unlock(new_folio);
+ folio_unlock(new_folio);
/*
* Subpages may be freed if there wasn't any mapping
* like if add_to_swap() is running on a lru page that
@@ -3369,81 +3641,90 @@ static void __split_huge_page(struct page *page, struct list_head *list,
* requires taking the lru_lock so we do the put_page
* of the tail pages after the split is complete.
*/
- free_page_and_swap_cache(subpage);
+ free_page_and_swap_cache(&new_folio->page);
}
+ return ret;
}
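
To make the non-uniform order progression concrete, here is a stand-alone sketch (user-space C, illustrative only; it ignores the anonymous order-1 restriction) reproducing the halving sequence for the order-9 to order-3 example used in the folio_split() documentation further down:

#include <stdio.h>

int main(void)
{
	int order = 9, new_order = 3;
	long target = 16;	/* first page of the third order-3 chunk */
	long base = 0;

	while (order > new_order) {
		long half = 1L << --order;

		/* the half not containing the target becomes its own folio */
		if (target < base + half) {
			printf("folio at page %ld, order %d\n", base + half, order);
		} else {
			printf("folio at page %ld, order %d\n", base, order);
			base += half;
		}
	}
	printf("target folio at page %ld, order %d\n", base, new_order);
	return 0;
}

The printed set matches the [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8] layout documented for folio_split().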
-/* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
+bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns)
{
- int extra_pins;
+ if (folio_test_anon(folio)) {
+ /* order-1 is not supported for anonymous THP. */
+ VM_WARN_ONCE(warns && new_order == 1,
+ "Cannot split to order-1 folio");
+ return new_order != 1;
+ } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ /*
+ * No split if the file system does not support large folio.
+ * Note that we might still have THPs in such mappings due to
+ * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
+ * does not actually support large folios properly.
+ */
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
- /* Additional pins from page cache */
- if (folio_test_anon(folio))
- extra_pins = folio_test_swapcache(folio) ?
- folio_nr_pages(folio) : 0;
- else
- extra_pins = folio_nr_pages(folio);
- if (pextra_pins)
- *pextra_pins = extra_pins;
- return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
- caller_pins;
+ /* Only swapping a whole PMD-mapped folio is supported */
+ if (folio_test_swapcache(folio)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split swapcache folio to non-0 order");
+ return false;
+ }
+
+ return true;
+}
+
+/* See comments in non_uniform_split_supported() */
+bool uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns)
+{
+ if (folio_test_anon(folio)) {
+ VM_WARN_ONCE(warns && new_order == 1,
+ "Cannot split to order-1 folio");
+ return new_order != 1;
+ } else if (new_order) {
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
+ }
+
+ if (new_order && folio_test_swapcache(folio)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split swapcache folio to non-0 order");
+ return false;
+ }
+
+ return true;
}
/*
- * This function splits a large folio into smaller folios of order @new_order.
- * @page can point to any page of the large folio to split. The split operation
- * does not change the position of @page.
- *
- * Prerequisites:
- *
- * 1) The caller must hold a reference on the @page's owning folio, also known
- * as the large folio.
+ * __folio_split: split a folio at @split_at to a @new_order folio
+ * @folio: folio to split
+ * @new_order: the order of the new folio
+ * @split_at: a page within the new folio
+ * @lock_at: a page within @folio to be left locked to caller
+ * @list: after-split folios will be put on it if not NULL
+ * @uniform_split: perform uniform split or not (non-uniform split)
*
- * 2) The large folio must be locked.
+ * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
+ * It is in charge of checking whether the split is supported or not and
+ * preparing @folio for __split_unmapped_folio().
*
- * 3) The folio must not be pinned. Any unexpected folio references, including
- * GUP pins, will result in the folio not getting split; instead, the caller
- * will receive an -EAGAIN.
- *
- * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
- * supported for non-file-backed folios, because folio->_deferred_list, which
- * is used by partially mapped folios, is stored in subpage 2, but an order-1
- * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
- * since they do not use _deferred_list.
- *
- * After splitting, the caller's folio reference will be transferred to @page,
- * resulting in a raised refcount of @page after this call. The other pages may
- * be freed if they are not mapped.
- *
- * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
- *
- * Pages in @new_order will inherit the mapping, flags, and so on from the
- * huge page.
- *
- * Returns 0 if the huge page was split successfully.
- *
- * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
- * the folio was concurrently removed from the page cache.
- *
- * Returns -EBUSY when trying to split the huge zeropage, if the folio is
- * under writeback, if fs-specific folio metadata cannot currently be
- * released, or if some unexpected race happened (e.g., anon VMA disappeared,
- * truncation).
- *
- * Callers should ensure that the order respects the address space mapping
- * min-order if one is set for non-anonymous folios.
- *
- * Returns -EINVAL when trying to split to an order that is incompatible
- * with the folio. Splitting to order 0 is compatible with all folios.
+ * Return: 0 on success, <0 on failure (if -ENOMEM is returned, @folio might
+ * be split, but not to @new_order; the caller needs to check)
*/
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
- unsigned int new_order)
+static int __folio_split(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct page *lock_at,
+ struct list_head *list, bool uniform_split)
{
- struct folio *folio = page_folio(page);
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
- /* reset xarray order to new order after split */
- XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
+ XA_STATE(xas, &folio->mapping->i_pages, folio->index);
bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
struct anon_vma *anon_vma = NULL;
@@ -3455,38 +3736,17 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ if (folio != page_folio(split_at) || folio != page_folio(lock_at))
+ return -EINVAL;
+
if (new_order >= folio_order(folio))
return -EINVAL;
- if (is_anon) {
- /* order-1 is not supported for anonymous THP. */
- if (new_order == 1) {
- VM_WARN_ONCE(1, "Cannot split to order-1 folio");
- return -EINVAL;
- }
- } else if (new_order) {
- /* Split shmem folio to non-zero order not supported */
- if (shmem_mapping(folio->mapping)) {
- VM_WARN_ONCE(1,
- "Cannot split shmem folio to non-0 order");
- return -EINVAL;
- }
- /*
- * No split if the file system does not support large folio.
- * Note that we might still have THPs in such mappings due to
- * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
- * does not actually support large folios properly.
- */
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- !mapping_large_folio_support(folio->mapping)) {
- VM_WARN_ONCE(1,
- "Cannot split file folio to non-0 order");
- return -EINVAL;
- }
- }
+ if (uniform_split && !uniform_split_supported(folio, new_order, true))
+ return -EINVAL;
- /* Only swapping a whole PMD-mapped folio is supported */
- if (folio_test_swapcache(folio) && new_order)
+ if (!uniform_split &&
+ !non_uniform_split_supported(folio, new_order, true))
return -EINVAL;
is_hzp = is_huge_zero_folio(folio);
@@ -3522,6 +3782,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
mapping = folio->mapping;
/* Truncated ? */
+ /*
+ * TODO: add support for large shmem folio in swap cache.
+ * When shmem is in swap cache, mapping is NULL and
+ * folio_test_swapcache() is true.
+ */
if (!mapping) {
ret = -EBUSY;
goto out;
@@ -3543,21 +3808,24 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
goto out;
}
- xas_split_alloc(&xas, folio, folio_order(folio), gfp);
- if (xas_error(&xas)) {
- ret = xas_error(&xas);
- goto out;
+ if (uniform_split) {
+ xas_set_order(&xas, folio->index, new_order);
+ xas_split_alloc(&xas, folio, folio_order(folio), gfp);
+ if (xas_error(&xas)) {
+ ret = xas_error(&xas);
+ goto out;
+ }
}
anon_vma = NULL;
i_mmap_lock_read(mapping);
/*
- *__split_huge_page() may need to trim off pages beyond EOF:
- * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
- * which cannot be nested inside the page tree lock. So note
- * end now: i_size itself may be changed at any moment, but
- * folio lock is good enough to serialize the trimming.
+ *__split_unmapped_folio() may need to trim off pages beyond
+ * EOF: but on 32-bit, i_size_read() takes an irq-unsafe
+ * seqlock, which cannot be nested inside the page tree lock.
+ * So note end now: i_size itself may be changed at any moment,
+ * but folio lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (shmem_mapping(mapping))
@@ -3611,7 +3879,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (mapping) {
int nr = folio_nr_pages(folio);
- xas_split(&xas, folio, folio_order(folio));
if (folio_test_pmd_mappable(folio) &&
new_order < HPAGE_PMD_ORDER) {
if (folio_test_swapbacked(folio)) {
@@ -3625,12 +3892,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
}
}
- if (is_anon) {
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
- }
- __split_huge_page(page, list, end, new_order);
- ret = 0;
+ ret = __split_unmapped_folio(folio, new_order,
+ split_at, lock_at, list, end, &xas, mapping,
+ uniform_split);
} else {
spin_unlock(&ds_queue->split_queue_lock);
fail:
@@ -3656,6 +3920,90 @@ out:
return ret;
}
+/*
+ * This function splits a large folio into smaller folios of order @new_order.
+ * @page can point to any page of the large folio to split. The split operation
+ * does not change the position of @page.
+ *
+ * Prerequisites:
+ *
+ * 1) The caller must hold a reference on the @page's owning folio, also known
+ * as the large folio.
+ *
+ * 2) The large folio must be locked.
+ *
+ * 3) The folio must not be pinned. Any unexpected folio references, including
+ * GUP pins, will result in the folio not getting split; instead, the caller
+ * will receive an -EAGAIN.
+ *
+ * 4) @new_order > 1, usually. Splitting anonymous folios to order-1 is not
+ * supported, because folio->_deferred_list, which is used by partially
+ * mapped folios, is stored in subpage 2, but an order-1 folio only has
+ * subpages 0 and 1. File-backed order-1 folios are supported, since they
+ * do not use _deferred_list.
+ *
+ * After splitting, the caller's folio reference will be transferred to @page,
+ * resulting in a raised refcount of @page after this call. The other pages may
+ * be freed if they are not mapped.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Pages in @new_order will inherit the mapping, flags, and so on from the
+ * huge page.
+ *
+ * Returns 0 if the huge page was split successfully.
+ *
+ * Returns -EAGAIN if the folio has an unexpected reference (e.g., GUP) or if
+ * the folio was concurrently removed from the page cache.
+ *
+ * Returns -EBUSY when trying to split the huge zeropage, if the folio is
+ * under writeback, if fs-specific folio metadata cannot currently be
+ * released, or if some unexpected race happened (e.g., anon VMA disappeared,
+ * truncation).
+ *
+ * Callers should ensure that the order respects the address space mapping
+ * min-order if one is set for non-anonymous folios.
+ *
+ * Returns -EINVAL when trying to split to an order that is incompatible
+ * with the folio. Splitting to order 0 is compatible with all folios.
+ */
+int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order)
+{
+ struct folio *folio = page_folio(page);
+
+ return __folio_split(folio, new_order, &folio->page, page, list, true);
+}
+
+/*
+ * folio_split: split a folio at @split_at to a @new_order folio
+ * @folio: folio to split
+ * @new_order: the order of the new folio
+ * @split_at: a page within the new folio
+ *
+ * Return: 0 on success, <0 on failure (if -ENOMEM is returned, @folio might
+ * be split, but not to @new_order; the caller needs to check)
+ *
+ * It has the same prerequisites and returns as
+ * split_huge_page_to_list_to_order().
+ *
+ * Split a folio at @split_at to a @new_order folio, leaving the
+ * remaining subpages of the original folio as large as possible. For example,
+ * when splitting an order-9 folio at its third order-3 subpage to
+ * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
+ * After the split, there will be a group of folios with different orders and
+ * the new folio containing @split_at is marked in brackets:
+ * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
+ *
+ * After the split, @folio is left locked for the caller.
+ */
+int folio_split(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct list_head *list)
+{
+ return __folio_split(folio, new_order, split_at, &folio->page, list,
+ false);
+}
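
A hedged usage sketch (hypothetical caller, not from this patch): isolating a single base page out of a large folio, under the same reference and locking prerequisites as split_huge_page_to_list_to_order():

static int isolate_one_page(struct folio *folio, pgoff_t index)
{
	/* the page of @folio that should end up in an order-0 folio */
	struct page *split_at = folio_page(folio, index - folio->index);

	VM_WARN_ON_ONCE(!folio_test_locked(folio));

	/* on -ENOMEM the folio may be partially split; re-check its order */
	return folio_split(folio, 0, split_at, NULL);
}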
+
int min_order_for_split(struct folio *folio)
{
if (folio_test_anon(folio))
@@ -3740,7 +4088,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
/*
* Exclude swapcache: originally to avoid a corrupt deferred split
- * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * queue. Nowadays that is fully prevented by memcg1_swapout();
* but if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in the shrinker, so excluding
* swapcache here may still be a useful optimization.
@@ -3975,7 +4323,8 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
}
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
- unsigned long vaddr_end, unsigned int new_order)
+ unsigned long vaddr_end, unsigned int new_order,
+ long in_folio_offset)
{
int ret = 0;
struct task_struct *task;
@@ -4059,8 +4408,16 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
if (!folio_test_anon(folio) && folio->mapping != mapping)
goto unlock;
- if (!split_folio_to_order(folio, target_order))
- split++;
+ if (in_folio_offset < 0 ||
+ in_folio_offset >= folio_nr_pages(folio)) {
+ if (!split_folio_to_order(folio, target_order))
+ split++;
+ } else {
+ struct page *split_at = folio_page(folio,
+ in_folio_offset);
+ if (!folio_split(folio, target_order, split_at, NULL))
+ split++;
+ }
unlock:
@@ -4083,7 +4440,8 @@ out:
}
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
- pgoff_t off_end, unsigned int new_order)
+ pgoff_t off_end, unsigned int new_order,
+ long in_folio_offset)
{
struct filename *file;
struct file *candidate;
@@ -4132,8 +4490,15 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (folio->mapping != mapping)
goto unlock;
- if (!split_folio_to_order(folio, target_order))
- split++;
+ if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
+ if (!split_folio_to_order(folio, target_order))
+ split++;
+ } else {
+ struct page *split_at = folio_page(folio,
+ in_folio_offset);
+ if (!folio_split(folio, target_order, split_at, NULL))
+ split++;
+ }
unlock:
folio_unlock(folio);
@@ -4166,6 +4531,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
int pid;
unsigned long vaddr_start, vaddr_end;
unsigned int new_order = 0;
+ long in_folio_offset = -1;
ret = mutex_lock_interruptible(&split_debug_mutex);
if (ret)
@@ -4194,30 +4560,33 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
goto out;
}
- ret = sscanf(tok_buf, "0x%lx,0x%lx,%d", &off_start,
- &off_end, &new_order);
- if (ret != 2 && ret != 3) {
+ ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
+ &new_order, &in_folio_offset);
+ if (ret != 2 && ret != 3 && ret != 4) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
+ ret = split_huge_pages_in_file(file_path, off_start, off_end,
+ new_order, in_folio_offset);
if (!ret)
ret = input_len;
goto out;
}
- ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
+ ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
+ &vaddr_end, &new_order, &in_folio_offset);
if (ret == 1 && pid == 1) {
split_huge_pages_all();
ret = strlen(input_buf);
goto out;
- } else if (ret != 3 && ret != 4) {
+ } else if (ret != 3 && ret != 4 && ret != 5) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
+ ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
+ in_folio_offset);
if (!ret)
ret = strlen(input_buf);
out:
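
For completeness, a hypothetical user-space sketch of the extended debugfs format; the fifth field is the new in-folio offset that routes the request through folio_split() (the pid and addresses here are made up):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char cmd[128];
	int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);

	if (fd < 0)
		return 1;
	/* <pid>,<vaddr_start>,<vaddr_end>,<new_order>,<in_folio_offset> */
	snprintf(cmd, sizeof(cmd), "%d,0x%lx,0x%lx,%d,%ld",
		 1234, 0x700000000000UL, 0x700000200000UL, 0, 2L);
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);
	return 0;
}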