From 84429b675bcfd2a518ae167ee4661cdf7539aa7d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 22 Aug 2024 15:50:09 +0200 Subject: fs: Allow fine-grained control of folio sizes We need filesystems to be able to communicate acceptable folio sizes to the pagecache for a variety of uses (e.g. large block sizes). Support a range of folio sizes between order-0 and order-31. Signed-off-by: Matthew Wilcox (Oracle) Co-developed-by: Pankaj Raghav Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-2-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Daniel Gomez Signed-off-by: Christian Brauner --- mm/filemap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index d62150418b91..ad5e4a848070 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1933,10 +1933,8 @@ no_page: if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; - if (!mapping_large_folio_support(mapping)) - order = 0; - if (order > MAX_PAGECACHE_ORDER) - order = MAX_PAGECACHE_ORDER; + if (order > mapping_max_folio_order(mapping)) + order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); -- cgit v1.2.3 From ab95d23bab220ef845c0d422f49452a475330eaf Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:10 +0200 Subject: filemap: allocate mapping_min_order folios in the page cache filemap_create_folio() and do_read_cache_folio() were always allocating folio of order 0. __filemap_get_folio was trying to allocate higher order folios when fgp_flags had higher order hint set but it will default to order 0 folio if higher order memory allocation fails. Supporting mapping_min_order implies that we guarantee each folio in the page cache has at least an order of mapping_min_order. When adding new folios to the page cache we must also ensure the index used is aligned to the mapping_min_order as the page cache requires the index to be aligned to the order of the folio. Co-developed-by: Luis Chamberlain Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-3-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Daniel Gomez Reviewed-by: Dave Chinner Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 20 ++++++++++++++++++++ mm/filemap.c | 24 ++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'mm/filemap.c') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c60025bb584c..4cc170949e9c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -448,6 +448,26 @@ mapping_min_folio_order(const struct address_space *mapping) return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; } +static inline unsigned long +mapping_min_folio_nrpages(struct address_space *mapping) +{ + return 1UL << mapping_min_folio_order(mapping); +} + +/** + * mapping_align_index() - Align index for this mapping. + * @mapping: The address_space. + * + * The index of a folio must be naturally aligned. If you are adding a + * new folio to the page cache and need to know what index to give it, + * call this function. + */ +static inline pgoff_t mapping_align_index(struct address_space *mapping, + pgoff_t index) +{ + return round_down(index, mapping_min_folio_nrpages(mapping)); +} + /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. diff --git a/mm/filemap.c b/mm/filemap.c index ad5e4a848070..d27e9ac54309 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -859,6 +859,8 @@ noinline int __filemap_add_folio(struct address_space *mapping, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), + folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); @@ -1919,8 +1921,10 @@ repeat: folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { - unsigned order = FGF_GET_ORDER(fgp_flags); + unsigned int min_order = mapping_min_folio_order(mapping); + unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; + index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; @@ -1943,7 +1947,7 @@ no_page: gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order > 0) + if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); if (!folio) @@ -1958,7 +1962,7 @@ no_page: break; folio_put(folio); folio = NULL; - } while (order-- > 0); + } while (order-- > min_order); if (err == -EEXIST) goto repeat; @@ -2447,13 +2451,15 @@ unlock_mapping: } static int filemap_create_folio(struct file *file, - struct address_space *mapping, pgoff_t index, + struct address_space *mapping, loff_t pos, struct folio_batch *fbatch) { struct folio *folio; int error; + unsigned int min_order = mapping_min_folio_order(mapping); + pgoff_t index; - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; @@ -2471,6 +2477,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); + index = (pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2531,8 +2538,7 @@ retry: if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - err = filemap_create_folio(filp, mapping, - iocb->ki_pos >> PAGE_SHIFT, fbatch); + err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; @@ -3748,9 +3754,11 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { - folio = filemap_alloc_folio(gfp, 0); + folio = filemap_alloc_folio(gfp, + mapping_min_folio_order(mapping)); if (!folio) return ERR_PTR(-ENOMEM); + index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); -- cgit v1.2.3 From 743a2753a02e805347969f6f89f38b736850d808 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:13 +0200 Subject: filemap: cap PTE range to be created to allowed zero fill in folio_map_range() Usually the page cache does not extend beyond the size of the inode, therefore, no PTEs are created for folios that extend beyond the size. But with LBS support, we might extend page cache beyond the size of the inode as we need to guarantee folios of minimum order. While doing a read, do_fault_around() can create PTEs for pages that lie beyond the EOF leading to incorrect error return when accessing a page beyond the mapped file. Cap the PTE range to be created for the page cache up to the end of file(EOF) in filemap_map_pages() so that return error codes are consistent with POSIX[1] for LBS configurations. generic/749 has been created to trigger this edge case. This also fixes generic/749 for tmpfs with huge=always on systems with 4k base page size. [1](from mmap(2)) SIGBUS Attempted access to a page of the buffer that lies beyond the end of the mapped file. For an explanation of the treatment of the bytes in the page that corresponds to the end of a mapped file that is not a multiple of the page size, see NOTES. Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-6-kernel@pankajraghav.com Tested-by: David Howells Acked-by: Darrick J. Wong Reviewed-by: Hannes Reinecke Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Reviewed-by: Daniel Gomez Reviewed-by: Dave Chinner Signed-off-by: Christian Brauner --- mm/filemap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index d27e9ac54309..d32210927453 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3608,7 +3608,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - pgoff_t last_pgoff = start_pgoff; + pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; @@ -3634,6 +3634,10 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; + if (end_pgoff > file_end) + end_pgoff = file_end; + folio_type = mm_counter_file(folio); do { unsigned long end; -- cgit v1.2.3