Diffstat (limited to 'mm/shmem.c')
-rw-r--r-- | mm/shmem.c | 214
1 file changed, 112 insertions, 102 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index 4ea6109a8043..99327c30507c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -86,7 +86,6 @@ static struct vfsmount *shm_mnt __ro_after_init;
 
 #include "internal.h"
 
-#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
 #define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
 
 /* Pretend that each entry is of this size in directory's i_size */
@@ -526,9 +525,9 @@ static bool shmem_confirm_swap(struct address_space *mapping,
  *	enables huge pages for the mount;
  * SHMEM_HUGE_WITHIN_SIZE:
  *	only allocate huge pages if the page will be fully within i_size,
- *	also respect fadvise()/madvise() hints;
+ *	also respect madvise() hints;
  * SHMEM_HUGE_ADVISE:
- *	only allocate huge pages if requested with fadvise()/madvise();
+ *	only allocate huge pages if requested with madvise();
  */
 
 #define SHMEM_HUGE_NEVER	0
@@ -591,6 +590,28 @@ shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t w
 	return order > 0 ? BIT(order + 1) - 1 : 0;
 }
 
+static unsigned int shmem_get_orders_within_size(struct inode *inode,
+		unsigned long within_size_orders, pgoff_t index,
+		loff_t write_end)
+{
+	pgoff_t aligned_index;
+	unsigned long order;
+	loff_t i_size;
+
+	order = highest_order(within_size_orders);
+	while (within_size_orders) {
+		aligned_index = round_up(index + 1, 1 << order);
+		i_size = max(write_end, i_size_read(inode));
+		i_size = round_up(i_size, PAGE_SIZE);
+		if (i_size >> PAGE_SHIFT >= aligned_index)
+			return within_size_orders;
+
+		order = next_order(&within_size_orders, order);
+	}
+
+	return 0;
+}
+
 static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 					      loff_t write_end, bool shmem_huge_force,
 					      struct vm_area_struct *vma,
@@ -599,9 +620,6 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
 	unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
 		0 : BIT(HPAGE_PMD_ORDER);
 	unsigned long within_size_orders;
-	unsigned int order;
-	pgoff_t aligned_index;
-	loff_t i_size;
 
 	if (!S_ISREG(inode->i_mode))
 		return 0;
@@ -635,16 +653,11 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
 		within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
 							       index, write_end);
 
-		order = highest_order(within_size_orders);
-		while (within_size_orders) {
-			aligned_index = round_up(index + 1, 1 << order);
-			i_size = max(write_end, i_size_read(inode));
-			i_size = round_up(i_size, PAGE_SIZE);
-			if (i_size >> PAGE_SHIFT >= aligned_index)
-				return within_size_orders;
+		within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
+								  index, write_end);
+		if (within_size_orders > 0)
+			return within_size_orders;
 
-			order = next_order(&within_size_orders, order);
-		}
 		fallthrough;
 	case SHMEM_HUGE_ADVISE:
 		if (vm_flags & VM_HUGEPAGE)
@@ -1380,9 +1393,9 @@ static void shmem_evict_inode(struct inode *inode)
 #endif
 }
 
-static int shmem_find_swap_entries(struct address_space *mapping,
-				   pgoff_t start, struct folio_batch *fbatch,
-				   pgoff_t *indices, unsigned int type)
+static unsigned int shmem_find_swap_entries(struct address_space *mapping,
+					    pgoff_t start, struct folio_batch *fbatch,
+					    pgoff_t *indices, unsigned int type)
 {
 	XA_STATE(xas, &mapping->i_pages, start);
 	struct folio *folio;
@@ -1415,7 +1428,7 @@ static int shmem_find_swap_entries(struct address_space *mapping,
 	}
 	rcu_read_unlock();
 
-	return xas.xa_index;
+	return folio_batch_count(fbatch);
 }
 
 /*
@@ -1462,8 +1475,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type)
 
 	do {
 		folio_batch_init(&fbatch);
-		shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
-		if (folio_batch_count(&fbatch) == 0) {
+		if (!shmem_find_swap_entries(mapping, start, &fbatch,
+					     indices, type)) {
 			ret = 0;
 			break;
 		}
@@ -1533,7 +1546,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	struct inode *inode = mapping->host;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-	swp_entry_t swap;
 	pgoff_t index;
 	int nr_pages;
 	bool split = false;
@@ -1548,7 +1560,7 @@
 	if (WARN_ON_ONCE(!wbc->for_reclaim))
 		goto redirty;
 
-	if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+	if ((info->flags & VM_LOCKED) || sbinfo->noswap)
 		goto redirty;
 
 	if (!total_swap_pages)
@@ -1615,14 +1627,6 @@ try_split:
 		folio_mark_uptodate(folio);
 	}
 
-	swap = folio_alloc_swap(folio);
-	if (!swap.val) {
-		if (nr_pages > 1)
-			goto try_split;
-
-		goto redirty;
-	}
-
 	/*
 	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
 	 * if it's not already there.  Do it now before the folio is
@@ -1635,20 +1639,20 @@ try_split:
 	if (list_empty(&info->swaplist))
 		list_add(&info->swaplist, &shmem_swaplist);
 
-	if (add_to_swap_cache(folio, swap,
-			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
-			NULL) == 0) {
+	if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
 		shmem_recalc_inode(inode, 0, nr_pages);
-		swap_shmem_alloc(swap, nr_pages);
-		shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
+		swap_shmem_alloc(folio->swap, nr_pages);
+		shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
 
 		mutex_unlock(&shmem_swaplist_mutex);
 		BUG_ON(folio_mapped(folio));
 		return swap_writepage(&folio->page, wbc);
 	}
 
+	list_del_init(&info->swaplist);
 	mutex_unlock(&shmem_swaplist_mutex);
-	put_swap_folio(folio, swap);
+	if (nr_pages > 1)
+		goto try_split;
 redirty:
 	folio_mark_dirty(folio);
 	if (wbc->for_reclaim)
@@ -1757,10 +1761,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
 	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
 	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
 	unsigned long vm_flags = vma ? vma->vm_flags : 0;
-	pgoff_t aligned_index;
 	unsigned int global_orders;
-	loff_t i_size;
-	int order;
 
 	if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
 		return 0;
@@ -1786,17 +1787,7 @@
 		return READ_ONCE(huge_shmem_orders_inherit);
 
 	/* Allow mTHP that will be fully within i_size. */
-	order = highest_order(within_size_orders);
-	while (within_size_orders) {
-		aligned_index = round_up(index + 1, 1 << order);
-		i_size = round_up(i_size_read(inode), PAGE_SIZE);
-		if (i_size >> PAGE_SHIFT >= aligned_index) {
-			mask |= within_size_orders;
-			break;
-		}
-
-		order = next_order(&within_size_orders, order);
-	}
+	mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
 
 	if (vm_flags & VM_HUGEPAGE)
 		mask |= READ_ONCE(huge_shmem_orders_madvise);
@@ -2017,7 +2008,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
 	__folio_set_swapbacked(new);
 	new->swap = entry;
 
-	mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+	memcg1_swapin(entry, nr_pages);
 	shadow = get_shadow_from_swap_cache(entry);
 	if (shadow)
 		workingset_refault(new, shadow);
@@ -2162,15 +2153,16 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
 {
 	struct address_space *mapping = inode->i_mapping;
 	XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
-	void *alloced_shadow = NULL;
-	int alloced_order = 0, i;
+	int split_order = 0, entry_order;
+	int i;
 
 	/* Convert user data gfp flags to xarray node gfp flags */
 	gfp &= GFP_RECLAIM_MASK;
 
 	for (;;) {
-		int order = -1, split_order = 0;
 		void *old = NULL;
+		int cur_order;
+		pgoff_t swap_index;
 
 		xas_lock_irq(&xas);
 		old = xas_load(&xas);
@@ -2179,60 +2171,56 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
 			goto unlock;
 		}
 
-		order = xas_get_order(&xas);
+		entry_order = xas_get_order(&xas);
 
-		/* Swap entry may have changed before we re-acquire the lock */
-		if (alloced_order &&
-		    (old != alloced_shadow || order != alloced_order)) {
-			xas_destroy(&xas);
-			alloced_order = 0;
-		}
+		if (!entry_order)
+			goto unlock;
 
 		/* Try to split large swap entry in pagecache */
-		if (order > 0) {
-			if (!alloced_order) {
-				split_order = order;
+		cur_order = entry_order;
+		swap_index = round_down(index, 1 << entry_order);
+
+		split_order = xas_try_split_min_order(cur_order);
+
+		while (cur_order > 0) {
+			pgoff_t aligned_index =
+				round_down(index, 1 << cur_order);
+			pgoff_t swap_offset = aligned_index - swap_index;
+
+			xas_set_order(&xas, index, split_order);
+			xas_try_split(&xas, old, cur_order);
+			if (xas_error(&xas))
 				goto unlock;
-			}
-			xas_split(&xas, old, order);
 
 			/*
 			 * Re-set the swap entry after splitting, and the swap
 			 * offset of the original large entry must be continuous.
 			 */
-			for (i = 0; i < 1 << order; i++) {
-				pgoff_t aligned_index = round_down(index, 1 << order);
+			for (i = 0; i < 1 << cur_order;
+			     i += (1 << split_order)) {
 				swp_entry_t tmp;
 
-				tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
+				tmp = swp_entry(swp_type(swap),
+						swp_offset(swap) + swap_offset +
+						i);
 				__xa_store(&mapping->i_pages, aligned_index + i,
 					   swp_to_radix_entry(tmp), 0);
 			}
+			cur_order = split_order;
+			split_order = xas_try_split_min_order(split_order);
 		}
 
unlock:
 		xas_unlock_irq(&xas);
 
-		/* split needed, alloc here and retry. */
-		if (split_order) {
-			xas_split_alloc(&xas, old, split_order, gfp);
-			if (xas_error(&xas))
-				goto error;
-			alloced_shadow = old;
-			alloced_order = split_order;
-			xas_reset(&xas);
-			continue;
-		}
-
 		if (!xas_nomem(&xas, gfp))
 			break;
 	}
 
-error:
 	if (xas_error(&xas))
 		return xas_error(&xas);
 
-	return alloced_order;
+	return entry_order;
 }
 
 /*
@@ -2253,7 +2241,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	struct folio *folio = NULL;
 	bool skip_swapcache = false;
 	swp_entry_t swap;
-	int error, nr_pages;
+	int error, nr_pages, order, split_order;
 
 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
 	swap = radix_to_swp_entry(*foliop);
@@ -2272,10 +2260,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 
 	/* Look it up and read it in.. */
 	folio = swap_cache_get_folio(swap, NULL, 0);
+	order = xa_get_order(&mapping->i_pages, index);
 	if (!folio) {
-		int order = xa_get_order(&mapping->i_pages, index);
 		bool fallback_order0 = false;
-		int split_order;
 
 		/* Or update major stats only when swapin succeeds?? */
 		if (fault_type) {
@@ -2339,6 +2326,29 @@
 			error = -ENOMEM;
 			goto failed;
 		}
+	} else if (order != folio_order(folio)) {
+		/*
+		 * Swap readahead may swap in order 0 folios into swapcache
+		 * asynchronously, while the shmem mapping can still store
+		 * large swap entries. In such cases, we should split the
+		 * large swap entry to prevent possible data corruption.
+		 */
+		split_order = shmem_split_large_entry(inode, index, swap, gfp);
+		if (split_order < 0) {
+			error = split_order;
+			goto failed;
+		}
+
+		/*
+		 * If the large swap entry has already been split, it is
+		 * necessary to recalculate the new swap entry based on
+		 * the old order alignment.
+		 */
+		if (split_order > 0) {
+			pgoff_t offset = index - round_down(index, 1 << split_order);
+
+			swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+		}
 	}
 
alloced:
@@ -2346,7 +2356,8 @@ alloced:
 	folio_lock(folio);
 	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
 	    folio->swap.val != swap.val ||
-	    !shmem_confirm_swap(mapping, index, swap)) {
+	    !shmem_confirm_swap(mapping, index, swap) ||
+	    xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
 		error = -EEXIST;
 		goto unlock;
 	}
@@ -3279,8 +3290,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 	if (ret)
 		return ret;
 
-	if (folio_test_hwpoison(folio) ||
-	    (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
+	if (folio_contain_hwpoisoned_page(folio)) {
 		folio_unlock(folio);
 		folio_put(folio);
 		return -EIO;
@@ -3487,7 +3497,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
 
 	size = min_t(size_t, size, PAGE_SIZE - offset);
 
-	if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+	if (!pipe_is_full(pipe)) {
 		struct pipe_buffer *buf = pipe_head_buf(pipe);
 
 		*buf = (struct pipe_buffer) {
@@ -3514,7 +3524,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	int error = 0;
 
 	/* Work out how much data we can actually add into the pipe */
-	used = pipe_occupancy(pipe->head, pipe->tail);
+	used = pipe_buf_usage(pipe);
 	npages = max_t(ssize_t, pipe->max_usage - used, 0);
 	len = min_t(size_t, len, npages * PAGE_SIZE);
 
@@ -3601,7 +3611,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		total_spliced += n;
 		*ppos += n;
 		in->f_ra.prev_pos = *ppos;
-		if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+		if (pipe_is_full(pipe))
 			break;
 
 		cond_resched();
@@ -3889,16 +3899,16 @@ out_iput:
 	return error;
 }
 
-static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-		       struct dentry *dentry, umode_t mode)
+static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+				  struct dentry *dentry, umode_t mode)
 {
 	int error;
 
 	error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
 	if (error)
-		return error;
+		return ERR_PTR(error);
 	inc_nlink(dir);
-	return 0;
+	return NULL;
 }
 
 static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -5651,19 +5661,19 @@ static int __init setup_thp_shmem(char *str)
 				THP_ORDERS_ALL_FILE_DEFAULT);
 		}
 
-		if (start == -EINVAL) {
+		if (start < 0) {
 			pr_err("invalid size %s in thp_shmem boot parameter\n",
 			       start_size);
 			goto err;
 		}
 
-		if (end == -EINVAL) {
+		if (end < 0) {
 			pr_err("invalid size %s in thp_shmem boot parameter\n",
 			       end_size);
 			goto err;
 		}
 
-		if (start < 0 || end < 0 || start > end)
+		if (start > end)
 			goto err;
 
 		nr = end - start + 1;
@@ -5830,7 +5840,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
  * underlying inode. So users of this interface must do LSM checks at a
  * higher layer. The users are the big_key and shm implementations. LSM
  * checks are provided at the key or shm level rather than the inode.
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
  * @size: size to be set for the file
  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
  */
@@ -5842,7 +5852,7 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
 
 /**
 * shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
@@ -5855,7 +5865,7 @@
 /**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
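
The core of the refactor above is the within_size loop that moves into shmem_get_orders_within_size(): starting from the highest candidate order, it returns the surviving order mask as soon as an order-aligned folio covering the faulting index would still sit entirely below i_size (rounded up to a page boundary), and otherwise drops to the next lower order. What follows is a minimal userspace sketch of that selection, with stand-in versions of the kernel's highest_order()/next_order() helpers (the real ones live in mm/internal.h) and an assumed 4K page size; it is an illustration of the logic, not the kernel code itself.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Stand-in for the kernel helper: index of the highest set bit. */
static int highest_order(unsigned long orders)
{
	return 8 * (int)sizeof(orders) - 1 - __builtin_clzl(orders);
}

/* Stand-in for the kernel helper: clear 'prev' and move to the next order. */
static int next_order(unsigned long *orders, int prev)
{
	*orders &= ~(1UL << prev);
	return *orders ? highest_order(*orders) : -1;
}

/* round up to a power-of-two multiple, as the kernel macro does */
static unsigned long round_up_ul(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

/*
 * Mirror of the factored-out loop: keep the remaining candidate mask as
 * soon as the highest order, aligned around 'index', still fits below
 * i_size; otherwise retry with the next lower order.
 */
static unsigned long orders_within_size(unsigned long orders,
					unsigned long index,
					unsigned long i_size)
{
	int order;

	if (!orders)
		return 0;

	order = highest_order(orders);
	while (orders) {
		/* first index past an order-aligned folio covering 'index' */
		unsigned long aligned_index = round_up_ul(index + 1, 1UL << order);

		if (round_up_ul(i_size, PAGE_SIZE) >> PAGE_SHIFT >= aligned_index)
			return orders;
		order = next_order(&orders, order);
	}
	return 0;
}

int main(void)
{
	/* 8 MiB file: an order-9 (2 MiB) folio at index 0 fits -> mask kept */
	printf("%#lx\n", orders_within_size(1UL << 9, 0, 8UL << 20));
	/* 1 MiB file: an order-9 folio does not fit -> 0 */
	printf("%#lx\n", orders_within_size(1UL << 9, 0, 1UL << 20));
	return 0;
}

Returning the whole surviving mask rather than a single order matches the kernel helper: both callers want every order that still fits, and shmem_huge_global_enabled() additionally folds in write_end via max(write_end, i_size_read(inode)), which the sketch omits.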
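In the swapin hunks, shmem_split_large_entry() now returns the order of the original large entry, and the caller recomputes the swap slot for the faulting index as its distance from the entry's aligned start. A small worked example of that fixup arithmetic follows; the index, order, and offset values are illustrative, not a real swap layout.

#include <stdio.h>

/* round down to a power-of-two multiple, as the kernel macro does */
static unsigned long round_down_ul(unsigned long x, unsigned long a)
{
	return x & ~(a - 1);
}

int main(void)
{
	unsigned long index = 517;       /* faulting page index */
	int entry_order = 9;             /* order of the large entry before the split */
	unsigned long old_offset = 4096; /* swap offset stored in the large entry */

	/* distance of 'index' from the start of the original large entry */
	unsigned long delta = index - round_down_ul(index, 1UL << entry_order);

	/* 517 - 512 = 5, so the order-0 entry at index 517 maps slot 4101 */
	printf("new swap offset = %lu\n", old_offset + delta);
	return 0;
}

This is exactly the `if (split_order > 0)` branch in shmem_swapin_folio(): since the split re-stores continuous swap offsets across the old range, adding the in-range distance yields the entry that now covers the faulting index.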
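The setup_thp_shmem() hunk folds the negative-value checks together: any negative parse result, not just -EINVAL, now rejects the parameter, so the trailing `start < 0 || end < 0` test becomes redundant and only `start > end` remains. For context, thp_shmem= takes semicolon-separated size ranges each with a policy, per Documentation/admin-guide/mm/transhuge.rst; the line below is an assumed illustration of that format, not taken from this patch:

thp_shmem=16K-64K:always;128K,512K:inherit;256K:advise;1M-2M:never;4M-8M:within_size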