diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-26 12:41:27 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-26 12:41:27 -0800 |
commit | fb527fc1f36e252cd1f62a26be4906949e7708ff (patch) | |
tree | 3066b8c59d8c7b37805d9ee2f50ed6db00a8c397 | |
parent | ff2a7a064a69069554564f52b6a84fc8a8c7d688 (diff) | |
parent | d1dfb5f52ffc4a142d88da5c0ed0514f3602c4b8 (diff) | |
download | lwn-fb527fc1f36e252cd1f62a26be4906949e7708ff.tar.gz lwn-fb527fc1f36e252cd1f62a26be4906949e7708ff.zip |
Merge tag 'fuse-update-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse
Pull fuse updates from Miklos Szeredi:
- Add page -> folio conversions (Joanne Koong, Josef Bacik)
- Allow max size of fuse requests to be configurable with a sysctl
(Joanne Koong)
- Allow FOPEN_DIRECT_IO to take advantage of async code path (yangyun)
- Fix large kernel reads (like a module load) in virtio_fs (Hou Tao)
- Fix attribute inconsistency in case readdirplus (and plain lookup in
corner cases) is racing with inode eviction (Zhang Tianci)
- Fix a WARN_ON triggered by virtio_fs (Asahi Lina)
* tag 'fuse-update-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse: (30 commits)
virtiofs: dax: remove ->writepages() callback
fuse: check attributes staleness on fuse_iget()
fuse: remove pages for requests and exclusively use folios
fuse: convert direct io to use folios
mm/writeback: add folio_mark_dirty_lock()
fuse: convert writebacks to use folios
fuse: convert retrieves to use folios
fuse: convert ioctls to use folios
fuse: convert writes (non-writeback) to use folios
fuse: convert reads to use folios
fuse: convert readdir to use folios
fuse: convert readlink to use folios
fuse: convert cuse to use folios
fuse: add support in virtio for requests using folios
fuse: support folios in struct fuse_args_pages and fuse_copy_pages()
fuse: convert fuse_notify_store to use folios
fuse: convert fuse_retrieve to use folios
fuse: use the folio based vmstat helpers
fuse: convert fuse_writepage_need_send to take a folio
fuse: convert fuse_do_readpage to use folios
...
-rw-r--r-- | Documentation/admin-guide/sysctl/fs.rst | 10 | ||||
-rw-r--r-- | fs/fuse/Makefile | 1 | ||||
-rw-r--r-- | fs/fuse/cuse.c | 29 | ||||
-rw-r--r-- | fs/fuse/dax.c | 11 | ||||
-rw-r--r-- | fs/fuse/dev.c | 66 | ||||
-rw-r--r-- | fs/fuse/dir.c | 37 | ||||
-rw-r--r-- | fs/fuse/file.c | 449 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 68 | ||||
-rw-r--r-- | fs/fuse/inode.c | 67 | ||||
-rw-r--r-- | fs/fuse/ioctl.c | 35 | ||||
-rw-r--r-- | fs/fuse/readdir.c | 33 | ||||
-rw-r--r-- | fs/fuse/sysctl.c | 40 | ||||
-rw-r--r-- | fs/fuse/virtio_fs.c | 77 | ||||
-rw-r--r-- | include/linux/mm.h | 1 | ||||
-rw-r--r-- | mm/folio-compat.c | 6 | ||||
-rw-r--r-- | mm/page-writeback.c | 22 |
16 files changed, 578 insertions, 374 deletions
diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst index 30c61474dec5..f5ec6c9312e1 100644 --- a/Documentation/admin-guide/sysctl/fs.rst +++ b/Documentation/admin-guide/sysctl/fs.rst @@ -337,3 +337,13 @@ Each "watch" costs roughly 90 bytes on a 32-bit kernel, and roughly 160 bytes on a 64-bit one. The current default value for ``max_user_watches`` is 4% of the available low memory, divided by the "watch" cost in bytes. + +5. /proc/sys/fs/fuse - Configuration options for FUSE filesystems +===================================================================== + +This directory contains the following configuration options for FUSE +filesystems: + +``/proc/sys/fs/fuse/max_pages_limit`` is a read/write file for +setting/getting the maximum number of pages that can be used for servicing +requests in FUSE. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index ce0ff7a9007b..2c372180d631 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -14,5 +14,6 @@ fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o +fuse-$(CONFIG_SYSCTL) += sysctl.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 0b2da7b7e2ad..b39844d75a80 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -303,8 +303,8 @@ struct cuse_init_args { struct fuse_args_pages ap; struct cuse_init_in in; struct cuse_init_out out; - struct page *page; - struct fuse_page_desc desc; + struct folio *folio; + struct fuse_folio_desc desc; }; /** @@ -326,7 +326,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = &ia->out; - struct page *page = ap->pages[0]; + struct folio *folio = ap->folios[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; @@ -343,7 +343,7 @@ static void 
cuse_process_init_reply(struct fuse_mount *fm, /* parse init reply */ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, + rc = cuse_parse_devinfo(folio_address(folio), ap->args.out_args[1].size, &devinfo); if (rc) goto err; @@ -411,7 +411,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, kobject_uevent(&dev->kobj, KOBJ_ADD); out: kfree(ia); - __free_page(page); + folio_put(folio); return; err_cdev: @@ -429,7 +429,7 @@ err: static int cuse_send_init(struct cuse_conn *cc) { int rc; - struct page *page; + struct folio *folio; struct fuse_mount *fm = &cc->fm; struct cuse_init_args *ia; struct fuse_args_pages *ap; @@ -437,13 +437,14 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); rc = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) + + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!folio) goto err; ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (!ia) - goto err_free_page; + goto err_free_folio; ap = &ia->ap; ia->in.major = FUSE_KERNEL_VERSION; @@ -459,18 +460,18 @@ static int cuse_send_init(struct cuse_conn *cc) ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; ap->args.out_argvar = true; ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &ia->page; + ap->num_folios = 1; + ap->folios = &ia->folio; ap->descs = &ia->desc; - ia->page = page; + ia->folio = folio; ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (rc) { kfree(ia); -err_free_page: - __free_page(page); +err_free_folio: + folio_put(folio); } err: return rc; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 12ef91d170bb..9abbc2f2894f 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -774,16 +774,6 @@ out: return ret; } -static int fuse_dax_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - - struct inode *inode = 
mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - - return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); -} - static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, bool write) { @@ -1323,7 +1313,6 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) } static const struct address_space_operations fuse_dax_file_aops = { - .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, }; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0723c6344b20..27ccae63495d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1028,17 +1028,27 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, struct fuse_req *req = cs->req; struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); - - for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { + for (i = 0; i < ap->num_folios && (nbytes || zeroing); i++) { int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); + struct page *orig, *pagep; + + orig = pagep = &ap->folios[i]->page; - err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); + err = fuse_copy_page(cs, &pagep, offset, count, zeroing); if (err) return err; nbytes -= count; + + /* + * fuse_copy_page may have moved a page from a pipe instead of + * copying into our given page, so update the folios if it was + * replaced. 
+ */ + if (pagep != orig) + ap->folios[i] = page_folio(pagep); } return 0; } @@ -1654,24 +1664,25 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { + struct folio *folio; struct page *page; unsigned int this_num; - err = -ENOMEM; - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) + folio = filemap_grab_folio(mapping, index); + err = PTR_ERR(folio); + if (IS_ERR(folio)) goto out_iput; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + page = &folio->page; + this_num = min_t(unsigned, num, folio_size(folio) - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!PageUptodate(page) && !err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) { - zero_user_segment(page, this_num, PAGE_SIZE); - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && !err && offset == 0 && + (this_num == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, this_num, folio_size(folio)); + folio_mark_uptodate(folio); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (err) goto out_iput; @@ -1703,7 +1714,7 @@ static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, struct fuse_retrieve_args *ra = container_of(args, typeof(*ra), ap.args); - release_pages(ra->ap.pages, ra->ap.num_pages); + release_pages(ra->ap.folios, ra->ap.num_folios); kfree(ra); } @@ -1717,7 +1728,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages; + unsigned int num_pages, cur_pages = 0; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1736,15 +1747,15 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - args_size += num_pages * 
(sizeof(ap->pages[0]) + sizeof(ap->descs[0])); + args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); ra = kzalloc(args_size, GFP_KERNEL); if (!ra) return -ENOMEM; ap = &ra->ap; - ap->pages = (void *) (ra + 1); - ap->descs = (void *) (ap->pages + num_pages); + ap->folios = (void *) (ra + 1); + ap->descs = (void *) (ap->folios + num_pages); args = &ap->args; args->nodeid = outarg->nodeid; @@ -1755,19 +1766,20 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num && ap->num_pages < num_pages) { - struct page *page; + while (num && cur_pages < num_pages) { + struct folio *folio; unsigned int this_num; - page = find_get_page(mapping, index); - if (!page) + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].offset = offset; - ap->descs[ap->num_pages].length = this_num; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = this_num; + ap->num_folios++; + cur_pages++; offset = 0; num -= this_num; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 54104dd48af7..494ac372ace0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -366,7 +366,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; - u64 attr_version; + u64 attr_version, evict_ctr; int err; *inode = NULL; @@ -381,6 +381,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out; attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); err = fuse_simple_request(fm, &args); @@ -398,7 +399,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr 
*name *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), - attr_version); + attr_version, evict_ctr); err = -ENOMEM; if (!*inode) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); @@ -691,7 +692,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, ff->nodeid = outentry.nodeid; ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, - &outentry.attr, ATTR_TIMEOUT(&outentry), 0); + &outentry.attr, ATTR_TIMEOUT(&outentry), 0, 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); @@ -822,7 +823,7 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, ATTR_TIMEOUT(&outarg), 0); + &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; @@ -1585,13 +1586,13 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct page *page) +static int fuse_readlink_page(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { - .num_pages = 1, - .pages = &page, + .num_folios = 1, + .folios = &folio, .descs = &desc, }; char *link; @@ -1614,7 +1615,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) if (WARN_ON(res >= PAGE_SIZE)) return -EIO; - link = page_address(page); + link = folio_address(folio); link[res] = '\0'; return 0; @@ -1624,7 +1625,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); - struct page *page; + struct folio *folio; int err; err 
= -EIO; @@ -1638,20 +1639,20 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) goto out_err; - page = alloc_page(GFP_KERNEL); + folio = folio_alloc(GFP_KERNEL, 0); err = -ENOMEM; - if (!page) + if (!folio) goto out_err; - err = fuse_readlink_page(inode, page); + err = fuse_readlink_page(inode, folio); if (err) { - __free_page(page); + folio_put(folio); goto out_err; } - set_delayed_call(callback, page_put_link, page); + set_delayed_call(callback, page_put_link, &folio->page); - return page_address(page); + return folio_address(folio); out_err: return ERR_PTR(err); @@ -2028,7 +2029,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), - fuse_get_cache_mask(inode)); + fuse_get_cache_mask(inode), 0); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate) @@ -2231,7 +2232,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, &folio->page); + int err = fuse_readlink_page(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index dafdf766b1d5..88d0946b5bc9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -436,7 +436,7 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_pages) + if (idx_from >= curr_index + wpa->ia.ap.num_folios) n = n->rb_right; else if (idx_to < curr_index) n = n->rb_left; @@ -483,6 +483,21 @@ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); } +static inline bool 
fuse_folio_is_writeback(struct inode *inode, + struct folio *folio) +{ + pgoff_t last = folio_next_index(folio) - 1; + return fuse_range_is_writeback(inode, folio_index(folio), last); +} + +static void fuse_wait_on_folio_writeback(struct inode *inode, + struct folio *folio) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); +} + /* * Wait for all pending writepages on the inode to finish. * @@ -645,17 +660,20 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_args_pages *ap, +static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, bool should_dirty) { unsigned int i; - for (i = 0; i < ap->num_pages; i++) { + for (i = 0; i < ap->num_folios; i++) { if (should_dirty) - set_page_dirty_lock(ap->pages[i]); + folio_mark_dirty_lock(ap->folios[i]); if (ap->args.is_pinned) - unpin_user_page(ap->pages[i]); + unpin_folio(ap->folios[i]); } + + if (nres > 0 && ap->args.invalidate_vmap) + invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -725,16 +743,16 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) } static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, - unsigned int npages) + unsigned int nfolios) { struct fuse_io_args *ia; ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (ia) { ia->io = io; - ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, - &ia->ap.descs); - if (!ia->ap.pages) { + ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.folios) { kfree(ia); ia = NULL; } @@ -744,7 +762,7 @@ static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, static void fuse_io_free(struct fuse_io_args *ia) { - kfree(ia->ap.pages); + kfree(ia->ap.folios); kfree(ia); } @@ -754,25 +772,29 @@ static void fuse_aio_complete_req(struct fuse_mount 
*fm, struct fuse_args *args, struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - - fuse_release_user_pages(&ia->ap, io->should_dirty); + size_t nres; if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; - } else if (ia->write.in.size != ia->write.out.size) { - pos = ia->write.in.offset - io->offset + - ia->write.out.size; + } else { + nres = ia->write.out.size; + if (ia->write.in.size != ia->write.out.size) + pos = ia->write.in.offset - io->offset + + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; + nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } + fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); + fuse_aio_complete(io, err, pos); fuse_io_free(ia); } @@ -843,33 +865,33 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, * reached the client fs yet. So the hole is not present there. 
*/ if (!fc->writeback_cache) { - loff_t pos = page_offset(ap->pages[0]) + num_read; + loff_t pos = folio_pos(ap->folios[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } -static int fuse_do_readpage(struct file *file, struct page *page) +static int fuse_do_readfolio(struct file *file, struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); - loff_t pos = page_offset(page); - struct fuse_page_desc desc = { .length = PAGE_SIZE }; + loff_t pos = folio_pos(folio); + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, - .ap.num_pages = 1, - .ap.pages = &page, + .ap.num_folios = 1, + .ap.folios = &folio, .ap.descs = &desc, }; ssize_t res; u64 attr_ver; /* - * Page writeback can extend beyond the lifetime of the - * page-cache page, so make sure we read a properly synced - * page. + * With the temporary pages that are used to complete writeback, we can + * have writeback that extends beyond the lifetime of the folio. So + * make sure we read a properly synced folio. 
*/ - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); attr_ver = fuse_get_attr_version(fm->fc); @@ -887,25 +909,24 @@ static int fuse_do_readpage(struct file *file, struct page *page) if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } static int fuse_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int err; err = -EIO; if (fuse_is_bad(inode)) goto out; - err = fuse_do_readpage(file, page); + err = fuse_do_readfolio(file, folio); fuse_invalidate_atime(inode); out: - unlock_page(page); + folio_unlock(folio); return err; } @@ -919,8 +940,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < ap->num_pages; i++) - mapping = ap->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_folios; i++) + mapping = ap->folios[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -934,12 +955,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_pages; i++) { - struct folio *folio = page_folio(ap->pages[i]); - - folio_end_read(folio, !err); - folio_put(folio); - } + for (i = 0; i < ap->num_folios; i++) + folio_end_read(ap->folios[i], !err); if (ia->ff) fuse_file_put(ia->ff, false); @@ -951,8 +968,9 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; - loff_t pos = page_offset(ap->pages[0]); - size_t count = ap->num_pages << PAGE_SHIFT; + loff_t pos = folio_pos(ap->folios[0]); + /* Currently, all folios in FUSE are one page */ + 
size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -963,7 +981,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { count--; - ap->descs[ap->num_pages - 1].length--; + ap->descs[ap->num_folios - 1].length--; } WARN_ON((loff_t) (pos + count) < 0); @@ -985,18 +1003,36 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int i, max_pages, nr_pages = 0; + unsigned int max_pages, nr_pages; + pgoff_t first = readahead_index(rac); + pgoff_t last = first + readahead_count(rac) - 1; if (fuse_is_bad(inode)) return; + wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); + max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); - for (;;) { + /* + * This is only accurate the first time through, since readahead_folio() + * doesn't update readahead_count() from the previous folio until the + * next call. Grab nr_pages here so we know how many pages we're going + * to have to process. This means that we will exit here with + * readahead_count() == folio_nr_pages(last_folio), but we will have + * consumed all of the folios, and read_pages() will call + * readahead_folio() again which will clean up the rac. 
+ */ + nr_pages = readahead_count(rac); + + while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; + struct folio *folio; + unsigned cur_pages = min(max_pages, nr_pages); if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -1006,23 +1042,19 @@ static void fuse_readahead(struct readahead_control *rac) */ break; - nr_pages = readahead_count(rac) - nr_pages; - if (nr_pages > max_pages) - nr_pages = max_pages; - if (nr_pages == 0) - break; - ia = fuse_io_alloc(NULL, nr_pages); + ia = fuse_io_alloc(NULL, cur_pages); if (!ia) return; ap = &ia->ap; - nr_pages = __readahead_batch(rac, ap->pages, nr_pages); - for (i = 0; i < nr_pages; i++) { - fuse_wait_on_page_writeback(inode, - readahead_index(rac) + i); - ap->descs[i].length = PAGE_SIZE; + + while (ap->num_folios < cur_pages) { + folio = readahead_folio(rac); + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].length = folio_size(folio); + ap->num_folios++; } - ap->num_pages = nr_pages; fuse_send_readpages(ia, rac->file); + nr_pages -= cur_pages; } } @@ -1139,8 +1171,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, bool short_write; int err; - for (i = 0; i < ap->num_pages; i++) - fuse_wait_on_page_writeback(inode, ap->pages[i]->index); + for (i = 0; i < ap->num_folios; i++) + fuse_wait_on_folio_writeback(inode, ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1154,24 +1186,24 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; - for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; + for (i = 0; i < ap->num_folios; i++) { + struct folio *folio = ap->folios[i]; if (err) { - ClearPageUptodate(page); + folio_clear_uptodate(folio); } else { - if (count >= PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; + if (count >= folio_size(folio) - 
offset) + count -= folio_size(folio) - offset; else { if (short_write) - ClearPageUptodate(page); + folio_clear_uptodate(folio); count = 0; } offset = 0; } - if (ia->write.page_locked && (i == ap->num_pages - 1)) - unlock_page(page); - put_page(page); + if (ia->write.folio_locked && (i == ap->num_folios - 1)) + folio_unlock(folio); + folio_put(folio); } return err; @@ -1185,6 +1217,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); + unsigned int nr_pages = 0; size_t count = 0; int err; @@ -1193,7 +1226,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, do { size_t tmp; - struct page *page; + struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; size_t bytes = min_t(size_t, PAGE_SIZE - offset, iov_iter_count(ii)); @@ -1205,27 +1238,30 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (fault_in_iov_iter_readable(ii, bytes)) break; - err = -ENOMEM; - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); break; + } if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + flush_dcache_folio(folio); - tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); - flush_dcache_page(page); + tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + flush_dcache_folio(folio); if (!tmp) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto again; } err = 0; - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = tmp; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].length = tmp; + ap->num_folios++; + nr_pages++; count += tmp; pos += tmp; @@ -1235,18 +1271,18 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, /* If we copied full page, 
mark it uptodate */ if (tmp == PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); } else { - ia->write.page_locked = true; + ia->write.folio_locked = true; break; } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - ap->num_pages < max_pages && offset == 0); + nr_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1280,8 +1316,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); - if (!ap->pages) { + ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->folios) { err = -ENOMEM; break; } @@ -1303,7 +1339,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) err = -EIO; } } - kfree(ap->pages); + kfree(ap->folios); } while (!err && iov_iter_count(ii)); fuse_write_update_attr(inode, pos, res); @@ -1430,11 +1466,7 @@ writethrough: task_io_account_write(count); - err = file_remove_privs(file); - if (err) - goto out; - - err = file_update_time(file); + err = kiocb_modified(iocb); if (err) goto out; @@ -1468,35 +1500,57 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, - unsigned int max_pages) + unsigned int max_pages, + bool use_pages_for_kvec_io) { + bool flush_or_invalidate = false; + unsigned int nr_pages = 0; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly into the buffer */ + /* Special case for kernel I/O: can copy directly into the buffer. 
+ * However if the implementation of fuse_conn requires pages instead of + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. + */ if (iov_iter_is_kvec(ii)) { - unsigned long user_addr = fuse_get_user_addr(ii); - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + void *user_addr = (void *)fuse_get_user_addr(ii); - if (write) - ap->args.in_args[1].value = (void *) user_addr; - else - ap->args.out_args[0].value = (void *) user_addr; + if (!use_pages_for_kvec_io) { + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; + if (write) + ap->args.in_args[1].value = user_addr; + else + ap->args.out_args[0].value = user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + if (is_vmalloc_addr(user_addr)) { + ap->args.vmap_base = user_addr; + flush_or_invalidate = true; + } } - while (nbytes < *nbytesp && ap->num_pages < max_pages) { - unsigned npages; + /* + * Until there is support for iov_iter_extract_folios(), we have to + * manually extract pages using iov_iter_extract_pages() and then + * copy that to a folios array. 
+ */ + struct page **pages = kzalloc(max_pages * sizeof(struct page *), + GFP_KERNEL); + if (!pages) + return -ENOMEM; + + while (nbytes < *nbytesp && nr_pages < max_pages) { + unsigned nfolios, i; size_t start; - struct page **pt_pages; - pt_pages = &ap->pages[ap->num_pages]; - ret = iov_iter_extract_pages(ii, &pt_pages, + ret = iov_iter_extract_pages(ii, &pages, *nbytesp - nbytes, - max_pages - ap->num_pages, + max_pages - nr_pages, 0, &start); if (ret < 0) break; @@ -1504,16 +1558,25 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, nbytes += ret; ret += start; - npages = DIV_ROUND_UP(ret, PAGE_SIZE); + /* Currently, all folios in FUSE are one page */ + nfolios = DIV_ROUND_UP(ret, PAGE_SIZE); - ap->descs[ap->num_pages].offset = start; - fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); + ap->descs[ap->num_folios].offset = start; + fuse_folio_descs_length_init(ap->descs, ap->num_folios, nfolios); + for (i = 0; i < nfolios; i++) + ap->folios[i + ap->num_folios] = page_folio(pages[i]); - ap->num_pages += npages; - ap->descs[ap->num_pages - 1].length -= + ap->num_folios += nfolios; + ap->descs[ap->num_folios - 1].length -= (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + nr_pages += nfolios; } + kfree(pages); + + if (write && flush_or_invalidate) + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); + ap->args.invalidate_vmap = !write && flush_or_invalidate; ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) @@ -1582,7 +1645,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, - max_pages); + max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; @@ -1596,7 +1659,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, nres, 
io->should_dirty); fuse_io_free(ia); } ia = NULL; @@ -1650,7 +1713,7 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t res; - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, to); } else { struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); @@ -1664,7 +1727,6 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; bool exclusive; @@ -1672,9 +1734,11 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) res = generic_write_checks(iocb, from); if (res > 0) { task_io_account_write(res); - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, from); } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, res); @@ -1760,21 +1824,21 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_pages; i++) - __free_page(ap->pages[i]); + for (i = 0; i < ap->num_folios; i++) + folio_put(ap->folios[i]); fuse_file_put(wpa->ia.ff, false); - kfree(ap->pages); + kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct page *page) +static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); + node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } @@ -1785,8 +1849,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct 
fuse_inode *fi = get_fuse_inode(inode); int i; - for (i = 0; i < ap->num_pages; i++) - fuse_writepage_finish_stat(inode, ap->pages[i]); + for (i = 0; i < ap->num_folios; i++) + fuse_writepage_finish_stat(inode, ap->folios[i]); wake_up(&fi->page_waitq); } @@ -1801,7 +1865,8 @@ __acquires(fi->lock) struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; - __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + /* Currently, all folios in FUSE are one page */ + __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; int err; fi->writectr++; @@ -1841,7 +1906,8 @@ __acquires(fi->lock) for (aux = wpa->next; aux; aux = next) { next = aux->next; aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, aux->ia.ap.pages[0]); + fuse_writepage_finish_stat(aux->inode, + aux->ia.ap.folios[0]); fuse_writepage_free(aux); } @@ -1876,11 +1942,11 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, struct fuse_writepage_args *wpa) { pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - WARN_ON(!wpa->ia.ap.num_pages); + WARN_ON(!wpa->ia.ap.num_folios); while (*p) { struct fuse_writepage_args *curr; pgoff_t curr_index; @@ -1891,7 +1957,7 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, WARN_ON(curr->inode != wpa->inode); curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + curr->ia.ap.num_pages) + if (idx_from >= curr_index + curr->ia.ap.num_folios) p = &(*p)->rb_right; else if (idx_to < curr_index) p = &(*p)->rb_left; @@ -2023,9 +2089,9 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) wpa = kzalloc(sizeof(*wpa), GFP_NOFS); if (wpa) { ap = &wpa->ia.ap; - ap->num_pages = 0; - ap->pages = fuse_pages_alloc(1, 
GFP_NOFS, &ap->descs); - if (!ap->pages) { + ap->num_folios = 0; + ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->folios) { kfree(wpa); wpa = NULL; } @@ -2049,19 +2115,19 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t page_index) + struct folio *tmp_folio, uint32_t folio_index) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; folio_copy(tmp_folio, folio); - ap->pages[page_index] = &tmp_folio->page; - ap->descs[page_index].offset = 0; - ap->descs[page_index].length = PAGE_SIZE; + ap->folios[folio_index] = tmp_folio; + ap->descs[folio_index].offset = 0; + ap->descs[folio_index].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(&tmp_folio->page, NR_WRITEBACK_TEMP); + node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2115,7 +2181,7 @@ static int fuse_writepage_locked(struct folio *folio) goto err_writepage_args; ap = &wpa->ia.ap; - ap->num_pages = 1; + ap->num_folios = 1; folio_start_writeback(folio); fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); @@ -2143,32 +2209,32 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct page **orig_pages; - unsigned int max_pages; + struct folio **orig_folios; + unsigned int max_folios; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) { struct fuse_args_pages *ap = &data->wpa->ia.ap; struct fuse_conn *fc = get_fuse_conn(data->inode); - struct page **pages; - struct fuse_page_desc *descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, data->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int nfolios = min_t(unsigned int, 
+ max_t(unsigned int, data->max_folios * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), fc->max_pages); - WARN_ON(npages <= data->max_pages); + WARN_ON(nfolios <= data->max_folios); - pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); - if (!pages) + folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); + if (!folios) return false; - memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); - memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); - kfree(ap->pages); - ap->pages = pages; + memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); + memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios); + kfree(ap->folios); + ap->folios = folios; ap->descs = descs; - data->max_pages = npages; + data->max_folios = nfolios; return true; } @@ -2178,7 +2244,7 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = wpa->ia.ap.num_pages; + int num_folios = wpa->ia.ap.num_folios; int i; spin_lock(&fi->lock); @@ -2186,8 +2252,8 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) fuse_flush_writepages(inode); spin_unlock(&fi->lock); - for (i = 0; i < num_pages; i++) - end_page_writeback(data->orig_pages[i]); + for (i = 0; i < num_folios; i++) + folio_end_writeback(data->orig_folios[i]); } /* @@ -2198,15 +2264,15 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * swapping the new temp page with the old one. 
*/ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct page *page) + struct folio *folio) { struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); struct fuse_writepage_args *tmp; struct fuse_writepage_args *old_wpa; struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_ap->num_pages != 0); - new_ap->num_pages = 1; + WARN_ON(new_ap->num_folios != 0); + new_ap->num_folios = 1; spin_lock(&fi->lock); old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); @@ -2220,9 +2286,9 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, WARN_ON(tmp->inode != new_wpa->inode); curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == page->index) { - WARN_ON(tmp->ia.ap.num_pages != 1); - swap(tmp->ia.ap.pages[0], new_ap->pages[0]); + if (curr_index == folio->index) { + WARN_ON(tmp->ia.ap.num_folios != 1); + swap(tmp->ia.ap.folios[0], new_ap->folios[0]); break; } } @@ -2235,18 +2301,19 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, spin_unlock(&fi->lock); if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, new_ap->pages[0]); + fuse_writepage_finish_stat(new_wpa->inode, + folio); fuse_writepage_free(new_wpa); } return false; } -static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, +static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, struct fuse_args_pages *ap, struct fuse_fill_wb_data *data) { - WARN_ON(!ap->num_pages); + WARN_ON(!ap->num_folios); /* * Being under writeback is unlikely but possible. For example direct @@ -2254,23 +2321,23 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, * the pages are faulted with get_user_pages(), and then after the read * completed. 
*/ - if (fuse_page_is_writeback(data->inode, page->index)) + if (fuse_folio_is_writeback(data->inode, folio)) return true; /* Reached max pages */ - if (ap->num_pages == fc->max_pages) + if (ap->num_folios == fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) + if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) return true; /* Discontinuity */ - if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? */ - if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) + if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data)) return true; return false; @@ -2295,7 +2362,7 @@ static int fuse_writepages_fill(struct folio *folio, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } @@ -2314,7 +2381,7 @@ static int fuse_writepages_fill(struct folio *folio, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_pages. + * request to the fi->writepages list and increment ap->num_folios. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. 
*/ @@ -2326,13 +2393,13 @@ static int fuse_writepages_fill(struct folio *folio, goto out_unlock; } fuse_file_get(wpa->ia.ff); - data->max_pages = 1; + data->max_folios = 1; ap = &wpa->ia.ap; } folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_pages); - data->orig_pages[ap->num_pages] = &folio->page; + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); + data->orig_folios[ap->num_folios] = folio; err = 0; if (data->wpa) { @@ -2341,9 +2408,9 @@ static int fuse_writepages_fill(struct folio *folio, * fuse_page_is_writeback(). */ spin_lock(&fi->lock); - ap->num_pages++; + ap->num_folios++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, &folio->page)) { + } else if (fuse_writepage_add(wpa, folio)) { data->wpa = wpa; } else { folio_end_writeback(folio); @@ -2375,21 +2442,21 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(fc->max_pages, - sizeof(struct page *), - GFP_NOFS); - if (!data.orig_pages) + data.orig_folios = kcalloc(fc->max_pages, + sizeof(struct folio *), + GFP_NOFS); + if (!data.orig_folios) goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { - WARN_ON(!data.wpa->ia.ap.num_pages); + WARN_ON(!data.wpa->ia.ap.num_folios); fuse_writepages_send(&data); } if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_pages); + kfree(data.orig_folios); out: return err; } @@ -2429,7 +2496,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, &folio->page); + err = fuse_do_readfolio(file, folio); if (err) goto cleanup; success: @@ -2518,17 +2585,17 @@ static void fuse_vma_close(struct vm_area_struct *vma) */ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = 
file_inode(vmf->vma->vm_file); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != inode->i_mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); return VM_FAULT_NOPAGE; } - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); return VM_FAULT_LOCKED; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e6cc3d552b13..74744c6f2860 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -35,9 +35,6 @@ /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 -/** Maximum of max_pages received in init_out */ -#define FUSE_MAX_MAX_PAGES 256 - /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -47,6 +44,9 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 +/** Maximum of max_pages received in init_out */ +extern unsigned int fuse_max_pages_limit; + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -285,8 +285,8 @@ struct fuse_arg { void *value; }; -/** FUSE page descriptor */ -struct fuse_page_desc { +/** FUSE folio descriptor */ +struct fuse_folio_desc { unsigned int length; unsigned int offset; }; @@ -309,16 +309,19 @@ struct fuse_args { bool may_block:1; bool is_ext:1; bool is_pinned:1; + bool invalidate_vmap:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); + /* Used for kvec iter backed by vmalloc address */ + void *vmap_base; }; struct fuse_args_pages { struct fuse_args args; - struct page **pages; - struct fuse_page_desc *descs; - unsigned int num_pages; + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int num_folios; }; struct fuse_release_args { @@ -857,6 +860,9 @@ struct fuse_conn { /** Passthrough support for read/write IO */ 
unsigned int passthrough:1; + /* Use pages instead of pointer for kernel I/O */ + unsigned int use_pages_for_kvec_io:1; + /** Maximum stack depth for passthrough backing files */ int max_stack_depth; @@ -884,6 +890,9 @@ struct fuse_conn { /** Version counter for attribute changes */ atomic64_t attr_version; + /** Version counter for evict inode */ + atomic64_t evict_ctr; + /** Called on final put */ void (*release)(struct fuse_conn *); @@ -978,6 +987,11 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline u64 fuse_get_evict_ctr(struct fuse_conn *fc) +{ + return atomic64_read(&fc->evict_ctr); +} + static inline bool fuse_stale_inode(const struct inode *inode, int generation, struct fuse_attr *attr) { @@ -995,25 +1009,25 @@ static inline bool fuse_is_bad(struct inode *inode) return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); } -static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) +static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags, + struct fuse_folio_desc **desc) { - struct page **pages; + struct folio **folios; - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) (pages + npages); + folios = kzalloc(nfolios * (sizeof(struct folio *) + + sizeof(struct fuse_folio_desc)), flags); + *desc = (void *) (folios + nfolios); - return pages; + return folios; } -static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, - unsigned int index, - unsigned int nr_pages) +static inline void fuse_folio_descs_length_init(struct fuse_folio_desc *descs, + unsigned int index, + unsigned int nr_folios) { int i; - for (i = index; i < index + nr_pages; i++) + for (i = index; i < index + nr_folios; i++) descs[i].length = PAGE_SIZE - descs[i].offset; } @@ -1037,7 +1051,8 @@ extern const struct dentry_operations fuse_root_dentry_operations; 
*/ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version); + u64 attr_valid, u64 attr_version, + u64 evict_ctr); int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode); @@ -1062,7 +1077,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; - bool page_locked; + bool folio_locked; } write; }; struct fuse_args_pages ap; @@ -1127,7 +1142,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask); + u64 attr_valid, u32 cache_mask, + u64 evict_ctr); u32 fuse_get_cache_mask(struct inode *inode); @@ -1480,4 +1496,12 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); +#ifdef CONFIG_SYSCTL +extern int fuse_sysctl_register(void); +extern void fuse_sysctl_unregister(void); +#else +#define fuse_sysctl_register() (0) +#define fuse_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd3321e29a3e..3ce4f4e81d09 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -35,6 +35,8 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); +unsigned int fuse_max_pages_limit = 256; + unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); @@ -173,6 +175,14 @@ static void fuse_evict_inode(struct inode *inode) fuse_cleanup_submount_lookup(fc, fi->submount_lookup); fi->submount_lookup = NULL; } + /* + * Evict of non-deleted inode may race with outstanding + * LOOKUP/READDIRPLUS requests and result in inconsistency when 
+ * the request finishes. Deal with that here by bumping a + * counter that can be compared to the starting value. + */ + if (inode->i_nlink > 0) + atomic64_inc(&fc->evict_ctr); } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(fi->iocachectr != 0); @@ -206,17 +216,30 @@ static ino_t fuse_squash_ino(u64 ino64) void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask) + u64 attr_valid, u32 cache_mask, + u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); lockdep_assert_held(&fi->lock); + /* + * Clear basic stats from invalid mask. + * + * Don't do this if this is coming from a fuse_iget() call and there + * might have been a racing evict which would've invalidated the result + * if the attr_version would've been preserved. + * + * !evict_ctr -> this is create + * fi->attr_version != 0 -> this is not a new inode + * evict_ctr == fuse_get_evict_ctr() -> no evicts during the request + */ + if (!evict_ctr || fi->attr_version || evict_ctr == fuse_get_evict_ctr(fc)) + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - /* Clear basic stats from invalid mask */ - set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -295,9 +318,9 @@ u32 fuse_get_cache_mask(struct inode *inode) return STATX_MTIME | STATX_CTIME | STATX_SIZE; } -void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - struct fuse_statx *sx, - u64 attr_valid, u64 attr_version) +static void fuse_change_attributes_i(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version, u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -331,7 +354,8 @@ 
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode_get_mtime(inode); - fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask, + evict_ctr); oldsize = inode->i_size; /* @@ -372,6 +396,13 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version) +{ + fuse_change_attributes_i(inode, attr, sx, attr_valid, attr_version, 0); +} + static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, u64 nodeid) { @@ -426,7 +457,8 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp) struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) + u64 attr_valid, u64 attr_version, + u64 evict_ctr) { struct inode *inode; struct fuse_inode *fi; @@ -487,8 +519,8 @@ retry: fi->nlookup++; spin_unlock(&fi->lock); done: - fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); - + fuse_change_attributes_i(inode, attr, NULL, attr_valid, attr_version, + evict_ctr); return inode; } @@ -940,11 +972,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->initialized = 0; fc->connected = 1; atomic64_set(&fc->attr_version, 1); + atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; - fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + fc->max_pages_limit = fuse_max_pages_limit; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_init(fc); @@ -1001,7 +1034,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = 
FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0, 0); } struct fuse_inode_handle { @@ -1610,7 +1643,8 @@ static int fuse_fill_super_submount(struct super_block *sb, return -ENOMEM; fuse_fill_attr_from_inode(&root_attr, parent_fi); - root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0, + fuse_get_evict_ctr(fm->fc)); /* * This inode is just a duplicate, so it is not looked up and * its nlookup should not be incremented. fuse_iget() does @@ -2063,8 +2097,14 @@ static int __init fuse_fs_init(void) if (err) goto out3; + err = fuse_sysctl_register(); + if (err) + goto out4; + return 0; + out4: + unregister_filesystem(&fuse_fs_type); out3: unregister_fuseblk(); out2: @@ -2075,6 +2115,7 @@ static int __init fuse_fs_init(void) static void fuse_fs_cleanup(void) { + fuse_sysctl_unregister(); unregister_filesystem(&fuse_fs_type); unregister_fuseblk(); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 572ce8a82ceb..2d9abf48828f 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -10,6 +10,8 @@ #include <linux/fileattr.h> #include <linux/fsverity.h> +#define FUSE_VERITY_ENABLE_ARG_MAX_PAGES 256 + static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, struct fuse_ioctl_out *outarg) { @@ -140,7 +142,7 @@ static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov, { struct fsverity_enable_arg enable; struct fsverity_enable_arg __user *uarg = (void __user *)arg; - const __u32 max_buffer_len = FUSE_MAX_MAX_PAGES * PAGE_SIZE; + const __u32 max_buffer_len = FUSE_VERITY_ENABLE_ARG_MAX_PAGES * PAGE_SIZE; if (copy_from_user(&enable, uarg, sizeof(enable))) return -EFAULT; @@ -249,12 +251,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - ap.pages = 
fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); + ap.folios = fuse_folios_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!ap.pages || !iov_page) + if (!ap.folios || !iov_page) goto out; - fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); + fuse_folio_descs_length_init(ap.descs, 0, fm->fc->max_pages); /* * If restricted, initialize IO parameters as encoded in @cmd. @@ -304,14 +306,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fm->fc->max_pages) goto out; - while (ap.num_pages < max_pages) { - ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!ap.pages[ap.num_pages]) + while (ap.num_folios < max_pages) { + ap.folios[ap.num_folios] = folio_alloc(GFP_KERNEL | __GFP_HIGHMEM, 0); + if (!ap.folios[ap.num_folios]) goto out; - ap.num_pages++; + ap.num_folios++; } - /* okay, let's send it to the client */ ap.args.opcode = FUSE_IOCTL; ap.args.nodeid = ff->nodeid; @@ -325,8 +326,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) { + c = copy_folio_from_iter(ap.folios[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } @@ -364,7 +365,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_local_page(ap.pages[0]); + vaddr = kmap_local_folio(ap.folios[0], 0); err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); @@ -392,17 +393,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; 
iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) { + c = copy_folio_to_iter(ap.folios[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } err = 0; out: free_page((unsigned long) iov_page); - while (ap.num_pages) - __free_page(ap.pages[--ap.num_pages]); - kfree(ap.pages); + while (ap.num_folios) + folio_put(ap.folios[--ap.num_folios]); + kfree(ap.folios); return err ? err : outarg.result; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 0377b6dc24c8..17ce9636a2b1 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -149,7 +149,7 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, static int fuse_direntplus_link(struct file *file, struct fuse_direntplus *direntplus, - u64 attr_version) + u64 attr_version, u64 evict_ctr) { struct fuse_entry_out *o = &direntplus->entry_out; struct fuse_dirent *dirent = &direntplus->dirent; @@ -233,7 +233,7 @@ retry: } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, &o->attr, ATTR_TIMEOUT(o), - attr_version); + attr_version, evict_ctr); if (!inode) inode = ERR_PTR(-ENOMEM); @@ -284,7 +284,8 @@ static void fuse_force_forget(struct file *file, u64 nodeid) } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) + struct dir_context *ctx, u64 attr_version, + u64 evict_ctr) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; @@ -319,7 +320,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, buf += reclen; nbytes -= reclen; - ret = fuse_direntplus_link(file, direntplus, attr_version); + ret = fuse_direntplus_link(file, direntplus, attr_version, evict_ctr); if (ret) fuse_force_forget(file, direntplus->entry_out.nodeid); } @@ -331,26 +332,27 @@ static 
int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct page *page; + struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; - struct fuse_page_desc desc = { .length = PAGE_SIZE }; - u64 attr_version = 0; + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + u64 attr_version = 0, evict_ctr = 0; bool locked; - page = alloc_page(GFP_KERNEL); - if (!page) + folio = folio_alloc(GFP_KERNEL, 0); + if (!folio) return -ENOMEM; plus = fuse_use_readdirplus(inode, ctx); ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &page; + ap->num_folios = 1; + ap->folios = &folio; ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -367,15 +369,16 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(page_address(page), res, - file, ctx, attr_version); + res = parse_dirplusfile(folio_address(folio), res, + file, ctx, attr_version, + evict_ctr); } else { - res = parse_dirfile(page_address(page), res, file, + res = parse_dirfile(folio_address(folio), res, file, ctx); } } - __free_page(page); + folio_put(folio); fuse_invalidate_atime(inode); return res; } diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c new file mode 100644 index 000000000000..b272bb333005 --- /dev/null +++ b/fs/fuse/sysctl.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/fuse/fuse_sysctl.c + * + * Sysctl interface to fuse parameters + */ +#include <linux/sysctl.h> + +#include "fuse_i.h" + +static struct ctl_table_header *fuse_table_header; + +/* Bound by fuse_init_out max_pages, which is a u16 */ +static unsigned int 
sysctl_fuse_max_pages_limit = 65535; + +static struct ctl_table fuse_sysctl_table[] = { + { + .procname = "max_pages_limit", + .data = &fuse_max_pages_limit, + .maxlen = sizeof(fuse_max_pages_limit), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &sysctl_fuse_max_pages_limit, + }, +}; + +int fuse_sysctl_register(void) +{ + fuse_table_header = register_sysctl("fs/fuse", fuse_sysctl_table); + if (!fuse_table_header) + return -ENOMEM; + return 0; +} + +void fuse_sysctl_unregister(void) +{ + unregister_sysctl_table(fuse_table_header); + fuse_table_header = NULL; +} diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6404a189e989..d88d3fc5306a 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -97,7 +97,8 @@ struct virtio_fs_req_work { }; static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req, bool in_flight); + struct fuse_req *req, bool in_flight, + gfp_t gfp); static const struct constant_table dax_param_enums[] = { {"always", FUSE_DAX_ALWAYS }, @@ -575,6 +576,8 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) /* Dispatch pending requests */ while (1) { + unsigned int flags; + spin_lock(&fsvq->lock); req = list_first_entry_or_null(&fsvq->queued_reqs, struct fuse_req, list); @@ -585,7 +588,9 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) list_del_init(&req->list); spin_unlock(&fsvq->lock); - ret = virtio_fs_enqueue_req(fsvq, req, true); + flags = memalloc_nofs_save(); + ret = virtio_fs_enqueue_req(fsvq, req, true, GFP_KERNEL); + memalloc_nofs_restore(flags); if (ret < 0) { if (ret == -ENOSPC) { spin_lock(&fsvq->lock); @@ -686,7 +691,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) } /* Allocate and copy args into req->argbuf */ -static int copy_args_to_argbuf(struct fuse_req *req) +static int copy_args_to_argbuf(struct fuse_req *req, gfp_t gfp) { struct fuse_args *args = req->args; unsigned int offset 
= 0; @@ -700,7 +705,7 @@ static int copy_args_to_argbuf(struct fuse_req *req) len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + fuse_len_args(num_out, args->out_args); - req->argbuf = kmalloc(len, GFP_ATOMIC); + req->argbuf = kmalloc(len, gfp); if (!req->argbuf) return -ENOMEM; @@ -760,7 +765,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; - struct page *page; + struct folio *folio; /* * TODO verify that server properly follows FUSE protocol @@ -772,12 +777,12 @@ static void virtio_fs_request_complete(struct fuse_req *req, if (args->out_pages && args->page_zeroing) { len = args->out_args[args->out_numargs - 1].size; ap = container_of(args, typeof(*ap), args); - for (i = 0; i < ap->num_pages; i++) { + for (i = 0; i < ap->num_folios; i++) { thislen = ap->descs[i].length; if (len < thislen) { WARN_ON(ap->descs[i].offset); - page = ap->pages[i]; - zero_user_segment(page, len, thislen); + folio = ap->folios[i]; + folio_zero_segment(folio, len, thislen); len = 0; } else { len -= thislen; @@ -1267,15 +1272,15 @@ static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *r } /* Count number of scatter-gather elements required */ -static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, - unsigned int num_pages, - unsigned int total_len) +static unsigned int sg_count_fuse_folios(struct fuse_folio_desc *folio_descs, + unsigned int num_folios, + unsigned int total_len) { unsigned int i; unsigned int this_len; - for (i = 0; i < num_pages && total_len; i++) { - this_len = min(page_descs[i].length, total_len); + for (i = 0; i < num_folios && total_len; i++) { + this_len = min(folio_descs[i].length, total_len); total_len -= this_len; } @@ -1294,8 +1299,8 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->in_pages) { size = args->in_args[args->in_numargs - 1].size; - total_sgs += 
sg_count_fuse_pages(ap->descs, ap->num_pages, - size); + total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, + size); } if (!test_bit(FR_ISREPLY, &req->flags)) @@ -1308,27 +1313,27 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_pages) { size = args->out_args[args->out_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, - size); + total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, + size); } return total_sgs; } -/* Add pages to scatter-gather list and return number of elements used */ -static unsigned int sg_init_fuse_pages(struct scatterlist *sg, - struct page **pages, - struct fuse_page_desc *page_descs, - unsigned int num_pages, - unsigned int total_len) +/* Add folios to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_folios(struct scatterlist *sg, + struct folio **folios, + struct fuse_folio_desc *folio_descs, + unsigned int num_folios, + unsigned int total_len) { unsigned int i; unsigned int this_len; - for (i = 0; i < num_pages && total_len; i++) { + for (i = 0; i < num_folios && total_len; i++) { sg_init_table(&sg[i], 1); - this_len = min(page_descs[i].length, total_len); - sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset); + this_len = min(folio_descs[i].length, total_len); + sg_set_folio(&sg[i], folios[i], this_len, folio_descs[i].offset); total_len -= this_len; } @@ -1353,10 +1358,10 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, sg_init_one(&sg[total_sgs++], argbuf, len); if (argpages) - total_sgs += sg_init_fuse_pages(&sg[total_sgs], - ap->pages, ap->descs, - ap->num_pages, - args[numargs - 1].size); + total_sgs += sg_init_fuse_folios(&sg[total_sgs], + ap->folios, ap->descs, + ap->num_folios, + args[numargs - 1].size); if (len_used) *len_used = len; @@ -1366,7 +1371,8 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, /* Add a request to a virtqueue and kick the device */ static int 
virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req, bool in_flight) + struct fuse_req *req, bool in_flight, + gfp_t gfp) { /* requests need at least 4 elements */ struct scatterlist *stack_sgs[6]; @@ -1387,8 +1393,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, /* Does the sglist fit on the stack? */ total_sgs = sg_count_fuse_req(req); if (total_sgs > ARRAY_SIZE(stack_sgs)) { - sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); - sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); + sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); if (!sgs || !sg) { ret = -ENOMEM; goto out; @@ -1396,7 +1402,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, } /* Use a bounce buffer since stack args cannot be mapped */ - ret = copy_args_to_argbuf(req); + ret = copy_args_to_argbuf(req, gfp); if (ret < 0) goto out; @@ -1490,7 +1496,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) queue_id); fsvq = &fs->vqs[queue_id]; - ret = virtio_fs_enqueue_req(fsvq, req, false); + ret = virtio_fs_enqueue_req(fsvq, req, false, GFP_ATOMIC); if (ret < 0) { if (ret == -ENOSPC) { /* @@ -1691,6 +1697,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; fc->sync_fs = true; + fc->use_pages_for_kvec_io = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, diff --git a/include/linux/mm.h b/include/linux/mm.h index 2bbf73eb53e7..c39c4945946c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2550,6 +2550,7 @@ struct kvec; struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); +bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 
80746182e9e8..1d1832e2a599 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -52,6 +52,12 @@ bool set_page_dirty(struct page *page) } EXPORT_SYMBOL(set_page_dirty); +int set_page_dirty_lock(struct page *page) +{ + return folio_mark_dirty_lock(page_folio(page)); +} +EXPORT_SYMBOL(set_page_dirty_lock); + bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fdb89ce85fff..d213ead95675 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2925,25 +2925,25 @@ bool folio_mark_dirty(struct folio *folio) EXPORT_SYMBOL(folio_mark_dirty); /* - * set_page_dirty() is racy if the caller has no reference against - * page->mapping->host, and if the page is unlocked. This is because another - * CPU could truncate the page off the mapping and then free the mapping. + * folio_mark_dirty() is racy if the caller has no reference against + * folio->mapping->host, and if the folio is unlocked. This is because another + * CPU could truncate the folio off the mapping and then free the mapping. * - * Usually, the page _is_ locked, or the caller is a user-space process which + * Usually, the folio _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * - * In other cases, the page should be locked before running set_page_dirty(). + * In other cases, the folio should be locked before running folio_mark_dirty(). */ -int set_page_dirty_lock(struct page *page) +bool folio_mark_dirty_lock(struct folio *folio) { - int ret; + bool ret; - lock_page(page); - ret = set_page_dirty(page); - unlock_page(page); + folio_lock(folio); + ret = folio_mark_dirty(folio); + folio_unlock(folio); return ret; } -EXPORT_SYMBOL(set_page_dirty_lock); +EXPORT_SYMBOL(folio_mark_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT |