diff options
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 255 |
1 files changed, 172 insertions, 83 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 4f476411a9a2..b5e784f34d98 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -47,6 +47,7 @@ #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> +#include <linux/sysctl.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -226,15 +227,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); - int refs = 1; free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); - if (folio_test_large(folio)) - refs = folio_nr_pages(folio); - folio_put_refs(folio, refs); + folio_put_refs(folio, folio_nr_pages(folio)); } /** @@ -441,6 +439,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, EXPORT_SYMBOL(filemap_fdatawrite_range); /** + * filemap_fdatawrite_range_kick - start writeback on a range + * @mapping: target address_space + * @start: index to start writeback on + * @end: last (inclusive) index for writeback + * + * This is a non-integrity writeback helper, to start writing back folios + * for the indicated range. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); +} +EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); + +/** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * @@ -841,11 +857,10 @@ EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { - XA_STATE(xas, &mapping->i_pages, index); - void *alloced_shadow = NULL; - int alloced_order = 0; + XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); bool huge; long nr; + unsigned int forder = folio_order(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); @@ -854,7 +869,6 @@ noinline int __filemap_add_folio(struct address_space *mapping, mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); - xas_set_order(&xas, index, folio_order(folio)); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); @@ -864,7 +878,7 @@ noinline int __filemap_add_folio(struct address_space *mapping, folio->index = xas.xa_index; for (;;) { - int order = -1, split_order = 0; + int order = -1; void *entry, *old = NULL; xas_lock_irq(&xas); @@ -882,21 +896,25 @@ noinline int __filemap_add_folio(struct address_space *mapping, order = xas_get_order(&xas); } - /* entry may have changed before we re-acquire the lock */ - if (alloced_order && (old != alloced_shadow || order != alloced_order)) { - xas_destroy(&xas); - alloced_order = 0; - } - if (old) { - if (order > 0 && order > folio_order(folio)) { + if (order > 0 && order > forder) { + unsigned int split_order = max(forder, + xas_try_split_min_order(order)); + /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); - if (!alloced_order) { - split_order = order; - goto unlock; + + while (order > forder) { + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, order); + if (xas_error(&xas)) + goto unlock; + order = split_order; + split_order = + max(xas_try_split_min_order( + split_order), + forder); } - xas_split(&xas, old, order); xas_reset(&xas); } if (shadowp) @@ -920,17 +938,6 @@ noinline int __filemap_add_folio(struct address_space *mapping, unlock: xas_unlock_irq(&xas); - /* split needed, alloc here and retry. */ - if (split_order) { - xas_split_alloc(&xas, old, split_order, gfp); - if (xas_error(&xas)) - goto error; - alloced_shadow = old; - alloced_order = split_order; - xas_reset(&xas); - continue; - } - if (!xas_nomem(&xas, gfp)) break; } @@ -1059,6 +1066,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio) return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } +/* How many times do we accept lock stealing from under a waiter? */ +static int sysctl_page_lock_unfairness = 5; +static const struct ctl_table filemap_sysctl_table[] = { + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +}; + void __init pagecache_init(void) { int i; @@ -1067,6 +1087,7 @@ void __init pagecache_init(void) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); + register_sysctl_init("vm", filemap_sysctl_table); } /* @@ -1214,9 +1235,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, return true; } -/* How many times do we accept lock stealing from under a waiter? */ -int sysctl_page_lock_unfairness = 5; - static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { @@ -1360,7 +1378,7 @@ repeat: * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is - * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. @@ -1464,25 +1482,6 @@ static int folio_put_wait_locked(struct folio *folio, int state) } /** - * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue - * @folio: Folio defining the wait queue of interest - * @waiter: Waiter to add to the queue - * - * Add an arbitrary @waiter to the wait queue for the nominated @folio. - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) -{ - wait_queue_head_t *q = folio_waitqueue(folio); - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, waiter); - folio_set_waiters(folio); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(folio_add_wait_queue); - -/** * folio_unlock - Unlock a locked folio. * @folio: The folio. * @@ -1590,6 +1589,27 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +/* + * If folio was marked as dropbehind, then pages should be dropped when writeback + * completes. Do that now. If we fail, it's likely because of a big folio - + * just reset dropbehind for that case and latter completions should invalidate. + */ +static void folio_end_dropbehind_write(struct folio *folio) +{ + /* + * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, + * but can happen if normal writeback just happens to find dirty folios + * that were created as part of uncached writeback, and that writeback + * would otherwise not need non-IRQ handling. Just skip the + * invalidation in that case. + */ + if (in_task() && folio_trylock(folio)) { + if (folio->mapping) + folio_unmap_invalidate(folio->mapping, folio, 0); + folio_unlock(folio); + } +} + /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. @@ -1600,6 +1620,8 @@ EXPORT_SYMBOL(folio_wait_private_2_killable); */ void folio_end_writeback(struct folio *folio) { + bool folio_dropbehind = false; + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* @@ -1621,9 +1643,14 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); + if (!folio_test_dirty(folio)) + folio_dropbehind = folio_test_clear_dropbehind(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); + + if (folio_dropbehind) + folio_end_dropbehind_write(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); @@ -1946,6 +1973,8 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); + if (fgp_flags & FGP_DONTCACHE) + __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) @@ -1956,8 +1985,19 @@ no_page: if (err == -EEXIST) goto repeat; - if (err) + if (err) { + /* + * When NOWAIT I/O fails to allocate folios this could + * be due to a nonblocking memory allocation and not + * because the system actually is out of memory. + * Return -EAGAIN so that there caller retries in a + * blocking fashion instead of propagating -ENOMEM + * to the application. + */ + if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) + err = -EAGAIN; return ERR_PTR(err); + } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. @@ -1968,6 +2008,9 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); + /* not an uncached lookup, clear uncached if set */ + if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) + folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); @@ -2450,18 +2493,22 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, loff_t pos, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; + if (iocb->ki_flags & IOCB_DONTCACHE) + __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock @@ -2477,7 +2524,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); - index = (pos >> (PAGE_SHIFT + min_order)) << min_order; + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2485,7 +2532,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2506,6 +2554,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } @@ -2515,7 +2565,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; @@ -2530,20 +2579,21 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; @@ -2584,6 +2634,20 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +static void filemap_end_dropbehind_read(struct address_space *mapping, + struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (folio_trylock(folio)) { + if (folio_test_clear_dropbehind(folio)) + folio_unmap_invalidate(mapping, folio, 0); + folio_unlock(folio); + } +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2697,8 +2761,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + filemap_end_dropbehind_read(mapping, folio); + folio_put(folio); + } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); @@ -2839,8 +2907,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; - while (spliced < size && - !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); @@ -2897,7 +2964,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -2957,7 +3024,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) goto out; } @@ -4027,17 +4094,6 @@ retry: bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { - status = -EFAULT; - break; - } - if (fatal_signal_pending(current)) { status = -EINTR; break; @@ -4055,6 +4111,12 @@ retry: if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); + /* + * Faults here on mmap()s can recurse into arbitrary + * filesystem code. Lots of locks are held that can + * deadlock. Use an atomic copy to avoid deadlocking + * in page fault handling. + */ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); @@ -4080,6 +4142,16 @@ retry: bytes = copied; goto retry; } + + /* + * 'folio' is now unlocked and faults on it can be + * handled. Ensure forward progress by trying to + * fault it in now. + */ + if (fault_in_iov_iter_readable(i, bytes) == bytes) { + status = -EFAULT; + break; + } } else { pos += status; written += status; @@ -4376,6 +4448,20 @@ resched: } /* + * See mincore: reveal pagecache information only for files + * that the calling process has write access to, or could (if + * tried) open for writing. + */ +static inline bool can_do_cachestat(struct file *f) +{ + if (f->f_mode & FMODE_WRITE) + return true; + if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) + return true; + return file_permission(f, MAY_WRITE) == 0; +} + +/* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the @@ -4430,6 +4516,9 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; + if (!can_do_cachestat(fd_file(f))) + return -EPERM; + if (flags != 0) return -EINVAL; |